From 20a5b6d31b6bb82da624ee1c48664c0dcbaf9e38 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 28 Mar 2024 09:13:42 +0000
Subject: [PATCH 01/51] init: add placeholder src/executor/executor.cc

---
 src/executor/executor.cc | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 src/executor/executor.cc
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
new file mode 100644
index 000000000..71705089e
--- /dev/null
+++ b/src/executor/executor.cc
@@ -0,0 +1,4 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+// This is used for execute json file generated by msccl scheduler

From abd4b3c35822fd9113db32ae741f4b6eb0bc173e Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 29 Mar 2024 07:34:26 +0000
Subject: [PATCH 02/51] init: add executor, execution plan and kernel skeletons

---
 src/executor/execution_kernel.cu | 17 ++++++++
 src/executor/execution_plan.cpp  |  3 ++
 src/executor/executor.cc         |  9 ++++
 src/include/execution_plan.hpp   | 70 ++++++++++++++++++++++++++++++++
 src/include/executor.hpp         | 38 +++++++++++++++++
 5 files changed, 137 insertions(+)
 create mode 100644 src/executor/execution_kernel.cu
 create mode 100644 src/executor/execution_plan.cpp
 create mode 100644 src/include/execution_plan.hpp
 create mode 100644 src/include/executor.hpp

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
new file mode 100644
index 000000000..f7b79315e
--- /dev/null
+++ b/src/executor/execution_kernel.cu
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "execution_plan.hpp"
+
+extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[];
+
+__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff) {
+  // read data from shared memory
+  // 1. get the number of command from shared memory
+  int nOps = sharedMem->nOperations;
+  for (int opId= 0; opId < nOps; opId++) {
+    // 2. get the command
+    mscclpp::Operation* op = sharedMem->operations + opId;
+    // 3. execute the command
+  }
+}
diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cpp
new file mode 100644
index 000000000..048847f54
--- /dev/null
+++ b/src/executor/execution_plan.cpp
@@ -0,0 +1,3 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 71705089e..de9e8ebd7 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -2,3 +2,12 @@
 // Licensed under the MIT license.
 
 // This is used for execute json file generated by msccl scheduler
+
+#include "executor.hpp"
+
+namespace mscclpp {
+
+void Executor::Impl::launchKernel() {
+}
+
+}  // namespace mscclpp
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
new file mode 100644
index 000000000..aa964ae1a
--- /dev/null
+++ b/src/include/execution_plan.hpp
@@ -0,0 +1,70 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_EXECUTOR_PLAN_HPP_
+#define MSCCLPP_EXECUTOR_PLAN_HPP_
+
+#include <mscclpp/core.hpp>
+#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/sm_channel.hpp>
+
+#include <string>
+
+namespace mscclpp {
+
+constexpr int MAX_CHANNEL = 24;
+constexpr int MAX_CHANNEL_PER_OPERATION = 8;
+
+enum class OperationType {
+  BARRIER,
+  PUT,
+  GET,
+  COPY,
+  SIGNAL,
+  WAIT,
+  FLUSH,
+  REDUCE,
+  READ_REDUCE_COPY,
+  READ_REDUCE_COPY_PUT,
+};
+
+enum class ChannelType {
+  SM,
+  PROXY,
+};
+
+struct Channels {
+  mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
+  mscclpp::DeviceHandle<mscclpp::ProxyChannel> proxyChannels[MAX_CHANNEL];
+};
+
+struct Operation {
+  OperationType type;
+  ChannelType channelType;
+  uint16_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
+  uint16_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
+  size_t inputOffset[MAX_CHANNEL_PER_OPERATION];
+  size_t outputOffset[MAX_CHANNEL_PER_OPERATION];
+  size_t srcOffset;
+  size_t dstOffset;
+  size_t size;
+};
+
+struct DeviceExecutionPlan {
+  int nSmChannels;
+  int nProxyChannels;
+  Channels channels;
+  int nOperations;
+  Operation operations[1];
+};
+
+class ExectionPlan {
+ public:
+  ExectionPlan();
+  void loadExecutionPlan(const std::string& json);
+  ~ExectionPlan();
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_EXECUTOR_PLAN_HPP_
diff --git a/src/include/executor.hpp b/src/include/executor.hpp
new file mode 100644
index 000000000..788b09e9e
--- /dev/null
+++ b/src/include/executor.hpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_EXECUTOR_HPP_
+#define MSCCLPP_EXECUTOR_HPP_
+
+#include <memory>
+#include <string>
+
+#include "execution_plan.hpp"
+
+namespace mscclpp {
+
+class Executor {
+ public:
+  Executor();
+  template <typename T>
+  void execute(std::shared_ptr<T> sendbuff, std::shared_ptr<T> recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+               const ExectionPlan& plan);
+  ~Executor();
+
+ private:
+  struct Impl;
+
+  std::shared_ptr<Impl> impl_;
+};
+
+struct Executor::Impl {
+  Impl();
+  void setupCommnucation(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+                         const ExectionPlan& plan);
+  void launchKernel();
+  ~Impl();
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_EXECUTOR_HPP_

From c3e0e022f4f467e9ff2e4ccc26b18c141d1c3417 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 29 Mar 2024 11:38:31 +0000
Subject: [PATCH 03/51] WIP: fetch nlohmann/json and extend ExecutionPlan interface

---
 CMakeLists.txt                   |  6 +++++-
 src/executor/execution_kernel.cu | 10 ++++++++--
 src/executor/execution_plan.cpp  | 10 ++++++++++
 src/include/execution_plan.hpp   | 32 +++++++++++++++++++++++++++-----
 src/include/executor.hpp         |  2 +-
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccddb366b..865ab81f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,13 +99,17 @@ find_package(IBVerbs REQUIRED)
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
 
+include(FetchContent)
+FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
+FetchContent_MakeAvailable(json)
+
 add_library(mscclpp_obj OBJECT)
 target_include_directories(mscclpp_obj
     SYSTEM PRIVATE
     ${GPU_INCLUDE_DIRS}
     ${IBVERBS_INCLUDE_DIRS}
     ${NUMA_INCLUDE_DIRS})
-target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads)
+target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads)
 set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
 if(USE_CUDA)
     target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA)
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index f7b79315e..cecf0605e 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -5,13 +5,19 @@
 
 extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[];
 
-__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff) {
+__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) {
   // read data from shared memory
   // 1. get the number of command from shared memory
   int nOps = sharedMem->nOperations;
-  for (int opId= 0; opId < nOps; opId++) {
+  mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannel = sharedMem->channels.smChannels;
+  mscclpp::DeviceHandle<mscclpp::ProxyChannel>* proxyChannel = sharedMem->channels.proxyChannels;
+  for (int opId = 0; opId < nOps; opId++) {
     // 2. get the command
     mscclpp::Operation* op = sharedMem->operations + opId;
     // 3. execute the command
+    switch (op->type) {
+      default:
+        break;
+    }
   }
 }
diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cpp
index 048847f54..1a9278cd2 100644
--- a/src/executor/execution_plan.cpp
+++ b/src/executor/execution_plan.cpp
@@ -1,3 +1,13 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include "execution_plan.hpp"
+
+#include <nlohmann/json.hpp>
+
+namespace mscclpp {
+using json = nlohmann::json;
+void ExecutionPlan::loadExecutionPlan(std::ifstream& file) {
+    json obj = json::parse(file);
+}
+}  // namespace mscclpp
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index aa964ae1a..368ccdc13 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -33,6 +33,19 @@ enum class ChannelType {
   PROXY,
 };
 
+enum class BufferType {
+  INPUT,
+  OUTPUT,
+  SCRATCH,
+};
+
+struct ChannelInfo {
+  BufferType srcBufferType;
+  BufferType dstBufferType;
+  ChannelType channelType;
+  std::vector<int> connectedPeers;
+};
+
 struct Channels {
   mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
   mscclpp::DeviceHandle<mscclpp::ProxyChannel> proxyChannels[MAX_CHANNEL];
@@ -53,16 +66,25 @@ struct Operation {
 struct DeviceExecutionPlan {
   int nSmChannels;
   int nProxyChannels;
-  Channels channels;
   int nOperations;
+  Channels channels;
   Operation operations[1];
 };
 
-class ExectionPlan {
+class ExecutionPlan {
  public:
-  ExectionPlan();
-  void loadExecutionPlan(const std::string& json);
-  ~ExectionPlan();
+  ExecutionPlan();
+  void loadExecutionPlan(std::ifstream& file);
+  std::vector<int> getConnectedPeers(int rank);
+  size_t getScratchSize(size_t inputSize);
+  std::vector<Operation> getOperations(int rank, int threadblock);
+  std::pair<int, int> getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
+                                                 BufferType dstBufferType, ChannelType channelType);
+  ~ExecutionPlan();
+
+ private:
+  // operations for [rank][threadblock]
+  std::vector<std::vector<Operation>> operations_;
 };
 
 }  // namespace mscclpp
diff --git a/src/include/executor.hpp b/src/include/executor.hpp
index 788b09e9e..647e55193 100644
--- a/src/include/executor.hpp
+++ b/src/include/executor.hpp
@@ -28,7 +28,7 @@ class Executor {
 struct Executor::Impl {
   Impl();
   void setupCommnucation(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-                         const ExectionPlan& plan);
+                         const ExecutionPlan& plan);
   void launchKernel();
   ~Impl();
 };

From 91d4df203b5887f8a554f9f7c10bed180d31a973 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 31 Mar 2024 08:11:14 +0000
Subject: [PATCH 04/51] WIP: add ExecutionContext and memory registration setup

---
 src/executor/executor.cc       | 60 +++++++++++++++++++++++++++++++++-
 src/include/execution_plan.hpp | 11 +++++--
 src/include/executor.hpp       | 44 ++++++++++++++++++++++---
 3 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index de9e8ebd7..96889092a 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -5,9 +5,67 @@
 
 #include "executor.hpp"
 
+namespace {
+static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2,
+                                         mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5,
+                                         mscclpp::Transport::IB6, mscclpp::Transport::IB7};
+}  // namespace
+
 namespace mscclpp {
 
-void Executor::Impl::launchKernel() {
+ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize,
+                                                       size_t recvBuffSize, const ExecutionPlan& plan) {
+  ExecutionPlanKey key = {sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan.getName()};
+  if (this->contexts.find(key) != this->contexts.end()) {
+    return this->contexts[key];
+  }
+  ExecutionContext context;
+  size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBuffSize);
+  std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
+  context.scratchBuffer = scratchBuffer;
+
+  std::vector<BufferType> bufferTypes = plan.getConnectedBufferTypes(rank, ChannelType::SM);
+  int nranksPerNode = plan.nranksPerNode();
+  auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) -> mscclpp::TransportFlags {
+    return mscclpp::Transport::CudaIpc;
+  };
+  auto getBufferInfo = [&](BufferType type) {
+    switch (type) {
+      case BufferType::INPUT:
+        return std::make_pair(sendbuff, sendBuffSize);
+      case BufferType::OUTPUT:
+        return std::make_pair(recvBuff, recvBuffSize);
+      case BufferType::SCRATCH:
+        return std::make_pair((void*)scratchBuffer.get(), scratchBufferSize);
+      default:
+        throw std::runtime_error("Invalid buffer type");
+    }
+  };
+  auto getConnectedPeers = [&](std::vector<ChannelInfo>& infos) {
+    std::vector<int> peers;
+    return peers;
+  };
+
+  for (BufferType bufferType : bufferTypes) {
+    std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, bufferType);
+    mscclpp::TransportFlags transportFlags = getTransportFlags(channelInfos, rank);
+    mscclpp::RegisteredMemory memory =
+        this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags);
+    std::vector<int> connectedPeers = getConnectedPeers(channelInfos);
+    std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteRegMemoryFutures;
+    for (int peer : connectedPeers) {
+      remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0));
+      comm->sendMemoryOnSetup(memory, peer, 0);
+    }
+    comm->setup();
+    for (int i = 0; i < remoteRegMemoryFutures.size(); i++) {
+      context.registeredMemories[{bufferType, connectedPeers[i]}].push_back(remoteRegMemoryFutures[i].get());
+    }
+  }
+  std::vector<ChannelInfo> smChannelInfos = plan.getChannelInfos(rank, ChannelType::SM);
+  return context;
 }
 
+void Executor::Impl::launchKernel() {}
+
 }  // namespace mscclpp
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 368ccdc13..66b3297a1 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -73,10 +73,14 @@ struct DeviceExecutionPlan {
 
 class ExecutionPlan {
  public:
-  ExecutionPlan();
+  ExecutionPlan(std::string name);
+  std::string getName() const;
   void loadExecutionPlan(std::ifstream& file);
-  std::vector<int> getConnectedPeers(int rank);
-  size_t getScratchSize(size_t inputSize);
+  int nranksPerNode() const;
+  std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
+  std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
+  std::vector<BufferType> getConnectedBufferTypes(int rank, ChannelType channelType) const;
+  size_t getScratchBufferSize(int rank, size_t inputSize) const;
   std::vector<Operation> getOperations(int rank, int threadblock);
   std::pair<int, int> getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
                                                  BufferType dstBufferType, ChannelType channelType);
@@ -85,6 +89,7 @@ class ExecutionPlan {
  private:
   // operations for [rank][threadblock]
   std::vector<std::vector<Operation>> operations_;
+  std::string name_;
 };
 
 }  // namespace mscclpp
diff --git a/src/include/executor.hpp b/src/include/executor.hpp
index 647e55193..0b6c379a7 100644
--- a/src/include/executor.hpp
+++ b/src/include/executor.hpp
@@ -4,8 +4,10 @@
 #ifndef MSCCLPP_EXECUTOR_HPP_
 #define MSCCLPP_EXECUTOR_HPP_
 
+#include <functional>
 #include <memory>
 #include <string>
+#include <unordered_map>
 
 #include "execution_plan.hpp"
 
@@ -13,10 +15,10 @@ namespace mscclpp {
 
 class Executor {
  public:
-  Executor();
+  Executor(const std::unordered_map<int, mscclpp::Connection> connections);
   template <typename T>
   void execute(std::shared_ptr<T> sendbuff, std::shared_ptr<T> recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-               const ExectionPlan& plan);
+               const ExecutionPlan& plan);
   ~Executor();
 
  private:
@@ -25,14 +27,46 @@ class Executor {
   std::shared_ptr<Impl> impl_;
 };
 
+struct ExecutionContext {
+  std::unordered_map<std::pair<BufferType, int>, std::vector<mscclpp::RegisteredMemory>> registeredMemories;
+  std::vector<mscclpp::SmChannel> smChannels;
+  std::vector<mscclpp::ProxyChannel> proxyChannels;
+  std::vector<DeviceExecutionPlan> deviceExecutionPlans;
+  std::shared_ptr<char> scratchBuffer;
+};
+
+struct ExecutionPlanKey {
+  void* sendBuff;
+  void* recvBuff;
+  size_t sendBuffSize;
+  size_t recvBuffSize;
+  std::string plan;
+};
+
 struct Executor::Impl {
-  Impl();
-  void setupCommnucation(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-                         const ExecutionPlan& plan);
+  std::unordered_map<ExecutionPlanKey, ExecutionContext> contexts;
+  const std::unordered_map<int, mscclpp::Connection> connections;
+  std::shared_ptr<mscclpp::Communicator> comm;
+
+  Impl(const std::unordered_map<int, mscclpp::Connection> connections);
+  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize,
+                                         size_t recvBuffSize, const ExecutionPlan& plan);
+  void setupRegisteredMemories(ExecutionContext& context, int rank, const ExecutionPlan& plan);
+  void setupChannels(ExecutionContext& context, int rank, const ExecutionPlan& plan);
   void launchKernel();
   ~Impl();
 };
 
 }  // namespace mscclpp
 
+namespace std {
+template <>
+struct hash<mscclpp::ExecutionPlanKey> {
+  std::size_t operator()(const mscclpp::ExecutionPlanKey& key) const {
+    return std::hash<void*>()(key.sendBuff) ^ std::hash<void*>()(key.recvBuff) ^ std::hash<size_t>()(key.sendBuffSize) ^
+           std::hash<size_t>()(key.recvBuffSize) ^ std::hash<std::string>()(key.plan);
+  }
+};
+}  // namespace std
+
 #endif  // MSCCLPP_EXECUTOR_HPP_

From d6ec1439dba739212a65f358df2da971b198c35f Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 1 Apr 2024 07:51:33 +0000
Subject: [PATCH 05/51] WIP: split context setup into memory and channel setup

---
 src/executor/executor.cc       | 118 +++++++++++++++++++++++++++------
 src/include/execution_plan.hpp |   4 +-
 src/include/executor.hpp       |  70 +++++++++++--------
 3 files changed, 141 insertions(+), 51 deletions(-)

diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 96889092a..2d0650cec 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-// This is used for execute json file generated by msccl scheduler
-
 #include "executor.hpp"
 
+#include <set>
+
 namespace {
 static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2,
                                          mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5,
@@ -13,59 +13,135 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans
 
 namespace mscclpp {
 
-ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize,
-                                                       size_t recvBuffSize, const ExecutionPlan& plan) {
-  ExecutionPlanKey key = {sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan.getName()};
+ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                                                       size_t recvBufferSize, const ExecutionPlan& plan) {
+  ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.getName()};
   if (this->contexts.find(key) != this->contexts.end()) {
     return this->contexts[key];
   }
   ExecutionContext context;
-  size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBuffSize);
+  size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBufferSize);
   std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
   context.scratchBuffer = scratchBuffer;
+  context.scratchBufferSize = scratchBufferSize;
+  this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
+  this->setupChannels(context, sendbuff, recvbuff, rank, plan);
+  return context;
+}
 
-  std::vector<BufferType> bufferTypes = plan.getConnectedBufferTypes(rank, ChannelType::SM);
+void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff,
+                                             size_t sendBufferSize, size_t recvBufferSize, int rank,
+                                             const ExecutionPlan& plan) {
   int nranksPerNode = plan.nranksPerNode();
-  auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) -> mscclpp::TransportFlags {
-    return mscclpp::Transport::CudaIpc;
+  auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) {
+    TransportFlags flags;
+    for (ChannelInfo& info : infos) {
+      if (info.channelType == ChannelType::SM) {
+        flags |= Transport::CudaIpc;
+      } else if (info.channelType == ChannelType::PROXY) {
+        flags |= IBs[rank % nranksPerNode];
+      }
+    }
+    return flags;
   };
   auto getBufferInfo = [&](BufferType type) {
     switch (type) {
       case BufferType::INPUT:
-        return std::make_pair(sendbuff, sendBuffSize);
+        return std::make_pair(sendbuff, sendBufferSize);
       case BufferType::OUTPUT:
-        return std::make_pair(recvBuff, recvBuffSize);
+        return std::make_pair(recvbuff, recvBufferSize);
       case BufferType::SCRATCH:
-        return std::make_pair((void*)scratchBuffer.get(), scratchBufferSize);
+        return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize);
       default:
         throw std::runtime_error("Invalid buffer type");
     }
   };
   auto getConnectedPeers = [&](std::vector<ChannelInfo>& infos) {
-    std::vector<int> peers;
-    return peers;
+    std::set<int> peers;
+    for (ChannelInfo& info : infos) {
+      for (int peer : info.connectedPeers) {
+        peers.insert(peer);
+      }
+    }
+    return std::vector<int>(peers.begin(), peers.end());
   };
 
+  std::vector<BufferType> bufferTypes = plan.getConnectedBufferTypes(rank);
   for (BufferType bufferType : bufferTypes) {
     std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, bufferType);
-    mscclpp::TransportFlags transportFlags = getTransportFlags(channelInfos, rank);
-    mscclpp::RegisteredMemory memory =
+    TransportFlags transportFlags = getTransportFlags(channelInfos, rank);
+    RegisteredMemory memory =
         this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags);
     std::vector<int> connectedPeers = getConnectedPeers(channelInfos);
     std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteRegMemoryFutures;
     for (int peer : connectedPeers) {
-      remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0));
       comm->sendMemoryOnSetup(memory, peer, 0);
+      remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0));
     }
     comm->setup();
     for (int i = 0; i < remoteRegMemoryFutures.size(); i++) {
-      context.registeredMemories[{bufferType, connectedPeers[i]}].push_back(remoteRegMemoryFutures[i].get());
+      context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get());
+    }
+  }
+}
+
+void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                                   int rank, const ExecutionPlan& plan) {
+  const auto channelTypes = {ChannelType::SM, ChannelType::PROXY};
+  std::vector<std::shared_ptr<SmDevice2DeviceSemaphore>> smSemaphores;
+  std::vector<mscclpp::SemaphoreId> proxySemaphores;
+  for (ChannelType channelType : channelTypes) {
+    std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, channelType);
+    for (ChannelInfo& info : channelInfos) {
+      for (int peer : info.connectedPeers) {
+        if (channelType == ChannelType::SM) {
+          smSemaphores.push_back(
+              std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, this->connections.at(peer)));
+        } else if (channelType == ChannelType::PROXY) {
+          proxySemaphores.push_back(this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer)));
+        }
+      }
+    }
+  }
+  this->comm->setup();
+  context.smSemaphores = std::move(smSemaphores);
+  context.proxySemaphores = std::move(proxySemaphores);
+
+  auto getBuffer = [&](BufferType type) {
+    switch (type) {
+      case BufferType::INPUT:
+        return sendbuff;
+      case BufferType::OUTPUT:
+        return recvbuff;
+      case BufferType::SCRATCH:
+        return (void*)context.scratchBuffer.get();
+      default:
+        throw std::runtime_error("Invalid buffer type");
+    }
+  };
+  for (ChannelType channelType : channelTypes) {
+    std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, channelType);
+    int index = 0;
+    for (ChannelInfo& info : channelInfos) {
+      void* src = getBuffer(info.srcBufferType);
+      void* dst = getBuffer(info.dstBufferType);
+      TransportFlags transport = context.registeredMemories.begin()->second.transports();
+      RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport);
+      for (int peer : info.connectedPeers) {
+        if (channelType == ChannelType::SM) {
+          context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}],
+                                          src, nullptr);
+        } else if (channelType == ChannelType::PROXY) {
+          context.proxyChannels.emplace_back(
+              this->proxyService->proxyChannel(proxySemaphores[index]),
+              this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]),
+              this->proxyService->addMemory(localMemory));
+        }
+      }
     }
   }
-  std::vector<ChannelInfo> smChannelInfos = plan.getChannelInfos(rank, ChannelType::SM);
-  return context;
 }
 
-void Executor::Impl::launchKernel() {}
+void Executor::Impl::launchKernel(ExecutionContext& context) {}
 
 }  // namespace mscclpp
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 66b3297a1..8b4342c8f 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -48,7 +48,7 @@ struct ChannelInfo {
 
 struct Channels {
   mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
-  mscclpp::DeviceHandle<mscclpp::ProxyChannel> proxyChannels[MAX_CHANNEL];
+  mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
 };
 
 struct Operation {
@@ -79,7 +79,7 @@ class ExecutionPlan {
   int nranksPerNode() const;
   std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
   std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
-  std::vector<BufferType> getConnectedBufferTypes(int rank, ChannelType channelType) const;
+  std::vector<BufferType> getConnectedBufferTypes(int rank) const;
   size_t getScratchBufferSize(int rank, size_t inputSize) const;
   std::vector<Operation> getOperations(int rank, int threadblock);
   std::pair<int, int> getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
diff --git a/src/include/executor.hpp b/src/include/executor.hpp
index 0b6c379a7..66518388a 100644
--- a/src/include/executor.hpp
+++ b/src/include/executor.hpp
@@ -12,7 +12,33 @@
 #include "execution_plan.hpp"
 
 namespace mscclpp {
+struct ExecutionContextKey {
+  void* sendBuff;
+  void* recvBuff;
+  size_t sendBuffSize;
+  size_t recvBuffSize;
+  std::string plan;
+};
+}  // namespace mscclpp
+
+namespace std {
+template <>
+struct hash<std::pair<mscclpp::BufferType, int>> {
+  std::size_t operator()(const std::pair<mscclpp::BufferType, int>& key) const {
+    return std::hash<int>()(key.second) ^ std::hash<int>()(static_cast<int>(key.first));
+  }
+};
 
+template <>
+struct hash<mscclpp::ExecutionContextKey> {
+  std::size_t operator()(const mscclpp::ExecutionContextKey& key) const {
+    return std::hash<void*>()(key.sendBuff) ^ std::hash<void*>()(key.recvBuff) ^ std::hash<size_t>()(key.sendBuffSize) ^
+           std::hash<size_t>()(key.recvBuffSize) ^ std::hash<std::string>()(key.plan);
+  }
+};
+}  // namespace std
+
+namespace mscclpp {
 class Executor {
  public:
   Executor(const std::unordered_map<int, mscclpp::Connection> connections);
@@ -28,45 +54,33 @@ class Executor {
 };
 
 struct ExecutionContext {
-  std::unordered_map<std::pair<BufferType, int>, std::vector<mscclpp::RegisteredMemory>> registeredMemories;
+  std::unordered_map<std::pair<BufferType, int>, mscclpp::RegisteredMemory> registeredMemories;
+  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
+  std::vector<mscclpp::SemaphoreId> proxySemaphores;
   std::vector<mscclpp::SmChannel> smChannels;
-  std::vector<mscclpp::ProxyChannel> proxyChannels;
+  std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
   std::vector<DeviceExecutionPlan> deviceExecutionPlans;
   std::shared_ptr<char> scratchBuffer;
-};
-
-struct ExecutionPlanKey {
-  void* sendBuff;
-  void* recvBuff;
-  size_t sendBuffSize;
-  size_t recvBuffSize;
-  std::string plan;
+  size_t scratchBufferSize;
 };
 
 struct Executor::Impl {
-  std::unordered_map<ExecutionPlanKey, ExecutionContext> contexts;
-  const std::unordered_map<int, mscclpp::Connection> connections;
+  std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
+  const std::unordered_map<int, std::shared_ptr<mscclpp::Connection>> connections;
   std::shared_ptr<mscclpp::Communicator> comm;
+  std::shared_ptr<mscclpp::ProxyService> proxyService;
 
   Impl(const std::unordered_map<int, mscclpp::Connection> connections);
-  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize,
-                                         size_t recvBuffSize, const ExecutionPlan& plan);
-  void setupRegisteredMemories(ExecutionContext& context, int rank, const ExecutionPlan& plan);
-  void setupChannels(ExecutionContext& context, int rank, const ExecutionPlan& plan);
-  void launchKernel();
-  ~Impl();
+  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                                         size_t recvBufferSize, const ExecutionPlan& plan);
+  void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                               size_t recvBufferSize, int rank, const ExecutionPlan& plan);
+  void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank,
+                     const ExecutionPlan& plan);
+  void launchKernel(ExecutionContext& context);
+  ~Impl() = default;
 };
 
 }  // namespace mscclpp
 
-namespace std {
-template <>
-struct hash<mscclpp::ExecutionPlanKey> {
-  std::size_t operator()(const mscclpp::ExecutionPlanKey& key) const {
-    return std::hash<void*>()(key.sendBuff) ^ std::hash<void*>()(key.recvBuff) ^ std::hash<size_t>()(key.sendBuffSize) ^
-           std::hash<size_t>()(key.recvBuffSize) ^ std::hash<std::string>()(key.plan);
-  }
-};
-}  // namespace std
-
 #endif  // MSCCLPP_EXECUTOR_HPP_

From 3fdd602c7a79d30f71099dfd540d7fd758668247 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 1 Apr 2024 09:45:29 +0000
Subject: [PATCH 06/51] WIP

---
 src/executor/execution_plan.cpp | 68 ++++++++++++++++++++++++++++++++-
 src/executor/executor.cc        | 26 +++++++++++--
 src/include/execution_plan.hpp  | 11 ++++--
 src/include/executor.hpp        | 16 ++++----
 4 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cpp
index 1a9278cd2..a4fa11d25 100644
--- a/src/executor/execution_plan.cpp
+++ b/src/executor/execution_plan.cpp
@@ -5,9 +5,75 @@
 
 #include <nlohmann/json.hpp>
 
+namespace {
+template <typename T, typename Predicate>
+std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {  // keeps, in order, the elements pred accepts
+  std::vector<T> kept;
+  std::remove_copy_if(vec.begin(), vec.end(), std::back_inserter(kept), [&pred](const T& e) { return !pred(e); });
+  return kept;
+}
+}  // namespace
+
 namespace mscclpp {
 using json = nlohmann::json;
+
+ExecutionPlan::ExecutionPlan(std::ifstream& file) { this->loadExecutionPlan(file); }
+
+std::string ExecutionPlan::getName() const { return this->name_; }
+
+int ExecutionPlan::nranksPerNode() const { return this->nranksPerNode_; }
+
+std::vector<ChannelInfo> ExecutionPlan::getChannelInfos(int rank, ChannelType channelType) const {
+  auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; };
+  return filter(this->channelInfos_.at(rank), pred);
+}
+
+std::vector<ChannelInfo> ExecutionPlan::getChannelInfos(int rank, BufferType dstBufferType) const {
+  auto pred = [dstBufferType](const ChannelInfo& info) { return info.dstBufferType == dstBufferType; };
+  return filter(this->channelInfos_.at(rank), pred);
+}
+
 void ExecutionPlan::loadExecutionPlan(std::ifstream& file) {
-    json obj = json::parse(file);
+  auto convertToBufferType = [](const std::string& str) {
+    if (str == "input") {
+      return BufferType::INPUT;
+    } else if (str == "output") {
+      return BufferType::OUTPUT;
+    } else if (str == "scratch") {
+      return BufferType::SCRATCH;
+    } else {
+      throw std::runtime_error("Invalid buffer type");
+    }
+  };
+  auto convertToChannelType = [](const std::string& str) {
+    if (str == "sm") {
+      return ChannelType::SM;
+    } else if (str == "proxy") {
+      return ChannelType::PROXY;
+    } else {
+      throw std::runtime_error("Invalid channel type");
+    }
+  };
+
+  json obj = json::parse(file);
+  this->name_ = obj["name"];
+  this->nranksPerNode_ = obj["nranksPerNode"];
+  auto gpus = obj["gpus"];
+  for (const auto& gpu : gpus) {
+    int rank = gpu["rank"];
+    std::vector<ChannelInfo> channelInfos;
+    for (const auto& channel : gpu["channels"]) {
+      ChannelInfo info;
+      info.srcBufferType = convertToBufferType(channel["srcBuffer"]);
+      info.dstBufferType = convertToBufferType(channel["dstBuffer"]);
+      info.channelType = convertToChannelType(channel["type"]);
+      for (const auto& peer : channel["connectedTo"]) {
+        info.connectedPeers.push_back(peer);
+      }
+      channelInfos.push_back(info);
+    }
+    this->channelInfos_[rank] = channelInfos;
+  }
 }
+
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 2d0650cec..f17808fa7 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -13,6 +13,22 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans
 
 namespace mscclpp {
 
+Executor::Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, mscclpp::Connection> connections)
+    : impl_(std::make_shared<Impl>(comm, connections)) {}
+
+void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+                       const ExecutionPlan& plan) {
+  ExecutionContext context =
+      this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
+  this->impl_->launchKernel(context);
+}
+
+Executor::Impl::Impl(std::shared_ptr<Communicator> comm,
+                     const std::unordered_map<int, std::shared_ptr<Connection>> connections)
+    : comm(comm), connections(connections) {
+  this->proxyService = std::make_shared<ProxyService>();
+}
+
 ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
                                                        size_t recvBufferSize, const ExecutionPlan& plan) {
   ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.getName()};
@@ -25,7 +41,7 @@ ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff,
   context.scratchBuffer = scratchBuffer;
   context.scratchBufferSize = scratchBufferSize;
   this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
-  this->setupChannels(context, sendbuff, recvbuff, rank, plan);
+  this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan);
   return context;
 }
 
@@ -95,8 +111,7 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo
     for (ChannelInfo& info : channelInfos) {
       for (int peer : info.connectedPeers) {
         if (channelType == ChannelType::SM) {
-          smSemaphores.push_back(
-              std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, this->connections.at(peer)));
+          smSemaphores.push_back(std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, this->connections.at(peer)));
         } else if (channelType == ChannelType::PROXY) {
           proxySemaphores.push_back(this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer)));
         }
@@ -142,6 +157,9 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo
   }
 }
 
-void Executor::Impl::launchKernel(ExecutionContext& context) {}
+void Executor::Impl::launchKernel(ExecutionContext& context) {
+  // Need to change to use flush function and make sure the proxy service will get the latest data.
+  this->proxyService->startProxy();
+}
 
 }  // namespace mscclpp
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 8b4342c8f..220cda9a6 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -7,8 +7,8 @@
 #include <mscclpp/core.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
-
 #include <string>
+#include <unordered_map>
 
 namespace mscclpp {
 
@@ -73,9 +73,8 @@ struct DeviceExecutionPlan {
 
 class ExecutionPlan {
  public:
-  ExecutionPlan(std::string name);
+  ExecutionPlan(std::ifstream& file);
   std::string getName() const;
-  void loadExecutionPlan(std::ifstream& file);
   int nranksPerNode() const;
   std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
   std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
@@ -84,12 +83,16 @@ class ExecutionPlan {
   std::vector<Operation> getOperations(int rank, int threadblock);
   std::pair<int, int> getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
                                                  BufferType dstBufferType, ChannelType channelType);
-  ~ExecutionPlan();
+  ~ExecutionPlan() = default;
 
  private:
+  void loadExecutionPlan(std::ifstream& file);
+
   // operations for [rank][threadblock]
   std::vector<std::vector<Operation>> operations_;
+  std::unordered_map<int, std::vector<ChannelInfo>> channelInfos_;
   std::string name_;
+  int nranksPerNode_;
 };
 
 }  // namespace mscclpp
diff --git a/src/include/executor.hpp b/src/include/executor.hpp
index 66518388a..88469416d 100644
--- a/src/include/executor.hpp
+++ b/src/include/executor.hpp
@@ -41,11 +41,9 @@ struct hash<mscclpp::ExecutionContextKey> {
 namespace mscclpp {
 class Executor {
  public:
-  Executor(const std::unordered_map<int, mscclpp::Connection> connections);
-  template <typename T>
-  void execute(std::shared_ptr<T> sendbuff, std::shared_ptr<T> recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-               const ExecutionPlan& plan);
-  ~Executor();
+  Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, mscclpp::Connection> connections);
+  void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan);
+  ~Executor() = default;
 
  private:
   struct Impl;
@@ -65,12 +63,12 @@ struct ExecutionContext {
 };
 
 struct Executor::Impl {
+  std::shared_ptr<Communicator> comm;
+  const std::unordered_map<int, std::shared_ptr<Connection>> connections;
+  std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
-  const std::unordered_map<int, std::shared_ptr<mscclpp::Connection>> connections;
-  std::shared_ptr<mscclpp::Communicator> comm;
-  std::shared_ptr<mscclpp::ProxyService> proxyService;
 
-  Impl(const std::unordered_map<int, mscclpp::Connection> connections);
+  Impl(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
   ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
                                          size_t recvBufferSize, const ExecutionPlan& plan);
   void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,

From bf681b327f8f15f2840cb3d1cf8f7ab34c51c1a4 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 1 Apr 2024 11:31:40 +0000
Subject: [PATCH 07/51] compiled

---
 include/mscclpp/executor.hpp                  | 40 ++++++++
 src/executor/execution_kernel.cu              | 40 ++++----
 .../{execution_plan.cpp => execution_plan.cc} | 36 +++++---
 src/executor/executor.cc                      | 92 ++++++++++++++++---
 src/include/execution_plan.hpp                | 20 ++--
 src/include/executor.hpp                      | 84 -----------------
 6 files changed, 169 insertions(+), 143 deletions(-)
 create mode 100644 include/mscclpp/executor.hpp
 rename src/executor/{execution_plan.cpp => execution_plan.cc} (60%)
 delete mode 100644 src/include/executor.hpp

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
new file mode 100644
index 000000000..c9f7e7f59
--- /dev/null
+++ b/include/mscclpp/executor.hpp
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_EXECUTOR_HPP_
+#define MSCCLPP_EXECUTOR_HPP_
+
+#include <memory>
+#include <mscclpp/core.hpp>
+#include <unordered_map>
+
+namespace mscclpp {
+
+class ExecutionPlan {  // Public handle to an execution plan; all state is held by the private Impl.
+ public:
+  ExecutionPlan(std::ifstream& file);  // NOTE(review): <fstream> is not included by this header - confirm it builds
+  ~ExecutionPlan() = default;
+
+ private:
+  struct Impl;                  // only forward-declared in this header
+  std::shared_ptr<Impl> impl_;  // shared_ptr: copies of an ExecutionPlan refer to the same Impl
+
+  friend class Executor;  // lets the executor code reach impl_
+};
+
+class Executor {  // Runs an ExecutionPlan on caller-provided buffers; implementation is hidden behind Impl.
+ public:
+  Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
+  Executor(const Executor&) = delete;  // non-copyable
+  Executor& operator=(const Executor&) = delete;
+  ~Executor() = default;  // NOTE(review): Impl is incomplete here; unique_ptr<Impl> may need this defined in the .cc
+
+  void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan);
+
+ private:
+  struct Impl;                  // only forward-declared in this header
+  std::unique_ptr<Impl> impl_;  // sole owner of the implementation object
+};
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_EXECUTOR_HPP_
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index cecf0605e..6b467dbcb 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -1,23 +1,23 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// // Copyright (c) Microsoft Corporation.
+// // Licensed under the MIT license.
 
-#include "execution_plan.hpp"
+// #include "execution_plan.hpp"
 
-extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[];
+// extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[];
 
-__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) {
-  // read data from shared memory
-  // 1. get the number of command from shared memory
-  int nOps = sharedMem->nOperations;
-  mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannel = sharedMem->channels.smChannels;
-  mscclpp::DeviceHandle<mscclpp::ProxyChannel>* proxyChannel = sharedMem->channels.proxyChannels;
-  for (int opId = 0; opId < nOps; opId++) {
-    // 2. get the command
-    mscclpp::Operation* op = sharedMem->operations + opId;
-    // 3. execute the command
-    switch (op->type) {
-      default:
-        break;
-    }
-  }
-}
+// __global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) {
+//   // read data from shared memory
+//   // 1. get the number of command from shared memory
+//   int nOps = sharedMem->nOperations;
+//   mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannel = sharedMem->channels.smChannels;
+//   mscclpp::DeviceHandle<mscclpp::ProxyChannel>* proxyChannel = sharedMem->channels.proxyChannels;
+//   for (int opId = 0; opId < nOps; opId++) {
+//     // 2. get the command
+//     mscclpp::Operation* op = sharedMem->operations + opId;
+//     // 3. execute the command
+//     switch (op->type) {
+//       default:
+//         break;
+//     }
+//   }
+// }
diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cc
similarity index 60%
rename from src/executor/execution_plan.cpp
rename to src/executor/execution_plan.cc
index a4fa11d25..5fe314cfc 100644
--- a/src/executor/execution_plan.cpp
+++ b/src/executor/execution_plan.cc
@@ -3,6 +3,7 @@
 
 #include "execution_plan.hpp"
 
+#include <fstream>
 #include <nlohmann/json.hpp>
 
 namespace {
@@ -17,23 +18,30 @@ std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {
 namespace mscclpp {
 using json = nlohmann::json;
 
-ExecutionPlan::ExecutionPlan(std::ifstream& file) { this->loadExecutionPlan(file); }
+ExecutionPlan::Impl::Impl(std::ifstream& file) { this->loadExecutionPlan(file); }
 
-std::string ExecutionPlan::getName() const { return this->name_; }
-
-int ExecutionPlan::nranksPerNode() const { return this->nranksPerNode_; }
-
-std::vector<ChannelInfo> ExecutionPlan::getChannelInfos(int rank, ChannelType channelType) const {
+std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const {
   auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; };
-  return filter(this->channelInfos_.at(rank), pred);
+  return filter(this->channelInfos.at(rank), pred);
 }
-
-std::vector<ChannelInfo> ExecutionPlan::getChannelInfos(int rank, BufferType dstBufferType) const {
+std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferType dstBufferType) const {
   auto pred = [dstBufferType](const ChannelInfo& info) { return info.dstBufferType == dstBufferType; };
-  return filter(this->channelInfos_.at(rank), pred);
+  return filter(this->channelInfos.at(rank), pred);
+}
+
+std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const {
+  return std::vector<BufferType>();
+}
+size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { return 0; };
+std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadblock) {
+  return std::vector<Operation>();
+}
+std::pair<int, int> ExecutionPlan::Impl::getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
+                                                                    BufferType dstBufferType, ChannelType channelType) {
+  return std::make_pair(0, 0);
 }
 
-void ExecutionPlan::loadExecutionPlan(std::ifstream& file) {
+void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
   auto convertToBufferType = [](const std::string& str) {
     if (str == "input") {
       return BufferType::INPUT;
@@ -56,8 +64,8 @@ void ExecutionPlan::loadExecutionPlan(std::ifstream& file) {
   };
 
   json obj = json::parse(file);
-  this->name_ = obj["name"];
-  this->nranksPerNode_ = obj["nranksPerNode"];
+  this->name = obj["name"];
+  this->nranksPerNode = obj["nranksPerNode"];
   auto gpus = obj["gpus"];
   for (const auto& gpu : gpus) {
     int rank = gpu["rank"];
@@ -72,7 +80,7 @@ void ExecutionPlan::loadExecutionPlan(std::ifstream& file) {
       }
       channelInfos.push_back(info);
     }
-    this->channelInfos_[rank] = channelInfos;
+    this->channelInfos[rank] = channelInfos;
   }
 }
 
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index f17808fa7..7ae13dd94 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -1,10 +1,45 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "executor.hpp"
-
+#include <mscclpp/executor.hpp>
+#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/sm_channel.hpp>
 #include <set>
 
+#include "execution_plan.hpp"
+
+namespace mscclpp {
+struct ExecutionContextKey {  // Lookup key for cached execution contexts: both buffers, their sizes, the plan name.
+  void* sendBuff;
+  void* recvBuff;
+  size_t sendBuffSize;
+  size_t recvBuffSize;
+  std::string plan;
+
+  bool operator==(const ExecutionContextKey& other) const {  // field-wise equality
+    if (sendBuff != other.sendBuff || recvBuff != other.recvBuff) return false;
+    return sendBuffSize == other.sendBuffSize && recvBuffSize == other.recvBuffSize && plan == other.plan;
+  }
+};
+}  // namespace mscclpp
+
+namespace std {
+template <>
+struct hash<std::pair<mscclpp::BufferType, int>> {  // hash for (buffer type, int) map keys
+  std::size_t operator()(const std::pair<mscclpp::BufferType, int>& key) const {
+    return std::hash<int>()(static_cast<int>(key.first)) * 31 + std::hash<int>()(key.second);  // order-sensitive
+  }
+};
+template <>
+struct hash<mscclpp::ExecutionContextKey> {  // multiply-add mix: equal fields (sendBuff == recvBuff) do not cancel
+  std::size_t operator()(const mscclpp::ExecutionContextKey& key) const {
+    std::size_t h = std::hash<void*>()(key.sendBuff) * 31 + std::hash<void*>()(key.recvBuff);
+    h = (h * 31 + std::hash<size_t>()(key.sendBuffSize)) * 31 + std::hash<size_t>()(key.recvBuffSize);
+    return h * 31 + std::hash<std::string>()(key.plan);
+  }
+};
+}  // namespace std
+
 namespace {
 static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2,
                                          mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5,
@@ -13,8 +48,37 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans
 
 namespace mscclpp {
 
-Executor::Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, mscclpp::Connection> connections)
-    : impl_(std::make_shared<Impl>(comm, connections)) {}
+struct ExecutionContext {  // State built per ExecutionContextKey and stored in Executor::Impl::contexts.
+  std::unordered_map<std::pair<BufferType, int>, mscclpp::RegisteredMemory> registeredMemories;  // by (type, peer)
+  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
+  std::vector<mscclpp::SemaphoreId> proxySemaphores;
+  std::vector<mscclpp::SmChannel> smChannels;
+  std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
+  std::vector<DeviceExecutionPlan> deviceExecutionPlans;
+  std::shared_ptr<char> scratchBuffer;  // allocated in setupExecutionContext
+  size_t scratchBufferSize;             // size passed to that allocation
+};
+
+struct Executor::Impl {
+  std::shared_ptr<Communicator> comm;
+  const std::unordered_map<int, std::shared_ptr<Connection>> connections;
+  std::shared_ptr<ProxyService> proxyService;
+  std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
+
+  Impl(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
+  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                                         size_t recvBufferSize, const ExecutionPlan& plan);
+  void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                               size_t recvBufferSize, int rank, const ExecutionPlan& plan);
+  void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank,
+                     const ExecutionPlan& plan);
+  void launchKernel(ExecutionContext& context);
+  ~Impl() = default;
+};
+
+Executor::Executor(std::shared_ptr<Communicator> comm,
+                   const std::unordered_map<int, std::shared_ptr<Connection>> connections)
+    : impl_(std::make_unique<Impl>(comm, connections)) {}
 
 void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
                        const ExecutionPlan& plan) {
@@ -31,12 +95,12 @@ Executor::Impl::Impl(std::shared_ptr<Communicator> comm,
 
 ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
                                                        size_t recvBufferSize, const ExecutionPlan& plan) {
-  ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.getName()};
+  ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name};
   if (this->contexts.find(key) != this->contexts.end()) {
     return this->contexts[key];
   }
   ExecutionContext context;
-  size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBufferSize);
+  size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize);
   std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
   context.scratchBuffer = scratchBuffer;
   context.scratchBufferSize = scratchBufferSize;
@@ -48,7 +112,7 @@ ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff,
 void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff,
                                              size_t sendBufferSize, size_t recvBufferSize, int rank,
                                              const ExecutionPlan& plan) {
-  int nranksPerNode = plan.nranksPerNode();
+  int nranksPerNode = plan.impl_->nranksPerNode;
   auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) {
     TransportFlags flags;
     for (ChannelInfo& info : infos) {
@@ -82,9 +146,9 @@ void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* se
     return std::vector<int>(peers.begin(), peers.end());
   };
 
-  std::vector<BufferType> bufferTypes = plan.getConnectedBufferTypes(rank);
+  std::vector<BufferType> bufferTypes = plan.impl_->getConnectedBufferTypes(rank);
   for (BufferType bufferType : bufferTypes) {
-    std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, bufferType);
+    std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, bufferType);
     TransportFlags transportFlags = getTransportFlags(channelInfos, rank);
     RegisteredMemory memory =
         this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags);
@@ -95,7 +159,7 @@ void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* se
       remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0));
     }
     comm->setup();
-    for (int i = 0; i < remoteRegMemoryFutures.size(); i++) {
+    for (size_t i = 0; i < remoteRegMemoryFutures.size(); i++) {
       context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get());
     }
   }
@@ -107,7 +171,7 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo
   std::vector<std::shared_ptr<SmDevice2DeviceSemaphore>> smSemaphores;
   std::vector<mscclpp::SemaphoreId> proxySemaphores;
   for (ChannelType channelType : channelTypes) {
-    std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, channelType);
+    std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
     for (ChannelInfo& info : channelInfos) {
       for (int peer : info.connectedPeers) {
         if (channelType == ChannelType::SM) {
@@ -135,11 +199,10 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo
     }
   };
   for (ChannelType channelType : channelTypes) {
-    std::vector<ChannelInfo> channelInfos = plan.getChannelInfos(rank, channelType);
+    std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
     int index = 0;
     for (ChannelInfo& info : channelInfos) {
       void* src = getBuffer(info.srcBufferType);
-      void* dst = getBuffer(info.dstBufferType);
       TransportFlags transport = context.registeredMemories.begin()->second.transports();
       RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport);
       for (int peer : info.connectedPeers) {
@@ -159,7 +222,8 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo
 
 void Executor::Impl::launchKernel(ExecutionContext& context) {
   // Need to change to use flush function and make sure the proxy service will get the latest data.
-  this->proxyService->startProxy();
+  // may need atomic variable
+  // this->proxyService->startProxy();
 }
 
 }  // namespace mscclpp
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 220cda9a6..e70ef5c6a 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -5,6 +5,7 @@
 #define MSCCLPP_EXECUTOR_PLAN_HPP_
 
 #include <mscclpp/core.hpp>
+#include <mscclpp/executor.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
 #include <string>
@@ -71,11 +72,11 @@ struct DeviceExecutionPlan {
   Operation operations[1];
 };
 
-class ExecutionPlan {
+struct ExecutionPlan::Impl {
  public:
-  ExecutionPlan(std::ifstream& file);
-  std::string getName() const;
-  int nranksPerNode() const;
+  Impl(std::ifstream& file);
+  ~Impl() = default;
+
   std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
   std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
   std::vector<BufferType> getConnectedBufferTypes(int rank) const;
@@ -83,16 +84,13 @@ class ExecutionPlan {
   std::vector<Operation> getOperations(int rank, int threadblock);
   std::pair<int, int> getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
                                                  BufferType dstBufferType, ChannelType channelType);
-  ~ExecutionPlan() = default;
-
- private:
   void loadExecutionPlan(std::ifstream& file);
 
   // operations for [rank][threadblock]
-  std::vector<std::vector<Operation>> operations_;
-  std::unordered_map<int, std::vector<ChannelInfo>> channelInfos_;
-  std::string name_;
-  int nranksPerNode_;
+  std::vector<std::vector<Operation>> operations;
+  std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
+  std::string name;
+  int nranksPerNode;
 };
 
 }  // namespace mscclpp
diff --git a/src/include/executor.hpp b/src/include/executor.hpp
deleted file mode 100644
index 88469416d..000000000
--- a/src/include/executor.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef MSCCLPP_EXECUTOR_HPP_
-#define MSCCLPP_EXECUTOR_HPP_
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "execution_plan.hpp"
-
-namespace mscclpp {
-struct ExecutionContextKey {
-  void* sendBuff;
-  void* recvBuff;
-  size_t sendBuffSize;
-  size_t recvBuffSize;
-  std::string plan;
-};
-}  // namespace mscclpp
-
-namespace std {
-template <>
-struct hash<std::pair<mscclpp::BufferType, int>> {
-  std::size_t operator()(const std::pair<mscclpp::BufferType, int>& key) const {
-    return std::hash<int>()(key.second) ^ std::hash<int>()(static_cast<int>(key.first));
-  }
-};
-
-template <>
-struct hash<mscclpp::ExecutionContextKey> {
-  std::size_t operator()(const mscclpp::ExecutionContextKey& key) const {
-    return std::hash<void*>()(key.sendBuff) ^ std::hash<void*>()(key.recvBuff) ^ std::hash<size_t>()(key.sendBuffSize) ^
-           std::hash<size_t>()(key.recvBuffSize) ^ std::hash<std::string>()(key.plan);
-  }
-};
-}  // namespace std
-
-namespace mscclpp {
-class Executor {
- public:
-  Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, mscclpp::Connection> connections);
-  void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan);
-  ~Executor() = default;
-
- private:
-  struct Impl;
-
-  std::shared_ptr<Impl> impl_;
-};
-
-struct ExecutionContext {
-  std::unordered_map<std::pair<BufferType, int>, mscclpp::RegisteredMemory> registeredMemories;
-  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
-  std::vector<mscclpp::SemaphoreId> proxySemaphores;
-  std::vector<mscclpp::SmChannel> smChannels;
-  std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
-  std::vector<DeviceExecutionPlan> deviceExecutionPlans;
-  std::shared_ptr<char> scratchBuffer;
-  size_t scratchBufferSize;
-};
-
-struct Executor::Impl {
-  std::shared_ptr<Communicator> comm;
-  const std::unordered_map<int, std::shared_ptr<Connection>> connections;
-  std::shared_ptr<ProxyService> proxyService;
-  std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
-
-  Impl(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
-  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                                         size_t recvBufferSize, const ExecutionPlan& plan);
-  void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                               size_t recvBufferSize, int rank, const ExecutionPlan& plan);
-  void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank,
-                     const ExecutionPlan& plan);
-  void launchKernel(ExecutionContext& context);
-  ~Impl() = default;
-};
-
-}  // namespace mscclpp
-
-#endif  // MSCCLPP_EXECUTOR_HPP_

From edf93df44e3b29f0329fcd8db5227090cd511290 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 1 Apr 2024 11:40:54 +0000
Subject: [PATCH 08/51] WIP

---
 src/executor/executor.cc | 270 +++++++++++++++++++--------------------
 1 file changed, 128 insertions(+), 142 deletions(-)

diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 7ae13dd94..324671e27 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -65,165 +65,151 @@ struct Executor::Impl {
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
 
-  Impl(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
-  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                                         size_t recvBufferSize, const ExecutionPlan& plan);
-  void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                               size_t recvBufferSize, int rank, const ExecutionPlan& plan);
-  void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank,
-                     const ExecutionPlan& plan);
-  void launchKernel(ExecutionContext& context);
+  Impl(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections)
+      : comm(comm), connections(connections) {
+    this->proxyService = std::make_shared<ProxyService>();
+  }
   ~Impl() = default;
-};
 
-Executor::Executor(std::shared_ptr<Communicator> comm,
-                   const std::unordered_map<int, std::shared_ptr<Connection>> connections)
-    : impl_(std::make_unique<Impl>(comm, connections)) {}
-
-void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-                       const ExecutionPlan& plan) {
-  ExecutionContext context =
-      this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
-  this->impl_->launchKernel(context);
-}
-
-Executor::Impl::Impl(std::shared_ptr<Communicator> comm,
-                     const std::unordered_map<int, std::shared_ptr<Connection>> connections)
-    : comm(comm), connections(connections) {
-  this->proxyService = std::make_shared<ProxyService>();
-}
-
-ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                                                       size_t recvBufferSize, const ExecutionPlan& plan) {
-  ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name};
-  if (this->contexts.find(key) != this->contexts.end()) {
-    return this->contexts[key];
+  ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                                         size_t recvBufferSize, const ExecutionPlan& plan) {
+    ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name};
+    if (this->contexts.find(key) != this->contexts.end()) {
+      return this->contexts[key];
+    }
+    ExecutionContext context;
+    size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize);
+    std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
+    context.scratchBuffer = scratchBuffer;
+    context.scratchBufferSize = scratchBufferSize;
+    this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
+    this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan);
+    return context;
   }
-  ExecutionContext context;
-  size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize);
-  std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
-  context.scratchBuffer = scratchBuffer;
-  context.scratchBufferSize = scratchBufferSize;
-  this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
-  this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan);
-  return context;
-}
 
-void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff,
-                                             size_t sendBufferSize, size_t recvBufferSize, int rank,
-                                             const ExecutionPlan& plan) {
-  int nranksPerNode = plan.impl_->nranksPerNode;
-  auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) {
-    TransportFlags flags;
-    for (ChannelInfo& info : infos) {
-      if (info.channelType == ChannelType::SM) {
-        flags |= Transport::CudaIpc;
-      } else if (info.channelType == ChannelType::PROXY) {
-        flags |= IBs[rank % nranksPerNode];
+  void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
+                               size_t recvBufferSize, int rank, const ExecutionPlan& plan) {
+    int nranksPerNode = plan.impl_->nranksPerNode;
+    auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) {
+      TransportFlags flags;
+      for (ChannelInfo& info : infos) {
+        if (info.channelType == ChannelType::SM) {
+          flags |= Transport::CudaIpc;
+        } else if (info.channelType == ChannelType::PROXY) {
+          flags |= IBs[rank % nranksPerNode];
+        }
       }
-    }
-    return flags;
-  };
-  auto getBufferInfo = [&](BufferType type) {
-    switch (type) {
-      case BufferType::INPUT:
-        return std::make_pair(sendbuff, sendBufferSize);
-      case BufferType::OUTPUT:
-        return std::make_pair(recvbuff, recvBufferSize);
-      case BufferType::SCRATCH:
-        return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize);
-      default:
-        throw std::runtime_error("Invalid buffer type");
-    }
-  };
-  auto getConnectedPeers = [&](std::vector<ChannelInfo>& infos) {
-    std::set<int> peers;
-    for (ChannelInfo& info : infos) {
-      for (int peer : info.connectedPeers) {
-        peers.insert(peer);
+      return flags;
+    };
+    auto getBufferInfo = [&](BufferType type) {
+      switch (type) {
+        case BufferType::INPUT:
+          return std::make_pair(sendbuff, sendBufferSize);
+        case BufferType::OUTPUT:
+          return std::make_pair(recvbuff, recvBufferSize);
+        case BufferType::SCRATCH:
+          return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize);
+        default:
+          throw std::runtime_error("Invalid buffer type");
+      }
+    };
+    auto getConnectedPeers = [&](std::vector<ChannelInfo>& infos) {
+      std::set<int> peers;
+      for (ChannelInfo& info : infos) {
+        for (int peer : info.connectedPeers) {
+          peers.insert(peer);
+        }
+      }
+      return std::vector<int>(peers.begin(), peers.end());
+    };
+
+    std::vector<BufferType> bufferTypes = plan.impl_->getConnectedBufferTypes(rank);
+    for (BufferType bufferType : bufferTypes) {
+      std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, bufferType);
+      TransportFlags transportFlags = getTransportFlags(channelInfos, rank);
+      RegisteredMemory memory =
+          this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags);
+      std::vector<int> connectedPeers = getConnectedPeers(channelInfos);
+      std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteRegMemoryFutures;
+      for (int peer : connectedPeers) {
+        comm->sendMemoryOnSetup(memory, peer, 0);
+        remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0));
+      }
+      comm->setup();
+      for (size_t i = 0; i < remoteRegMemoryFutures.size(); i++) {
+        context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get());
       }
-    }
-    return std::vector<int>(peers.begin(), peers.end());
-  };
-
-  std::vector<BufferType> bufferTypes = plan.impl_->getConnectedBufferTypes(rank);
-  for (BufferType bufferType : bufferTypes) {
-    std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, bufferType);
-    TransportFlags transportFlags = getTransportFlags(channelInfos, rank);
-    RegisteredMemory memory =
-        this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags);
-    std::vector<int> connectedPeers = getConnectedPeers(channelInfos);
-    std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteRegMemoryFutures;
-    for (int peer : connectedPeers) {
-      comm->sendMemoryOnSetup(memory, peer, 0);
-      remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0));
-    }
-    comm->setup();
-    for (size_t i = 0; i < remoteRegMemoryFutures.size(); i++) {
-      context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get());
     }
   }
-}
 
-void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                                   int rank, const ExecutionPlan& plan) {
-  const auto channelTypes = {ChannelType::SM, ChannelType::PROXY};
-  std::vector<std::shared_ptr<SmDevice2DeviceSemaphore>> smSemaphores;
-  std::vector<mscclpp::SemaphoreId> proxySemaphores;
-  for (ChannelType channelType : channelTypes) {
-    std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
-    for (ChannelInfo& info : channelInfos) {
-      for (int peer : info.connectedPeers) {
-        if (channelType == ChannelType::SM) {
-          smSemaphores.push_back(std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, this->connections.at(peer)));
-        } else if (channelType == ChannelType::PROXY) {
-          proxySemaphores.push_back(this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer)));
+  void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank,
+                     const ExecutionPlan& plan) {
+    const auto channelTypes = {ChannelType::SM, ChannelType::PROXY};
+    std::vector<std::shared_ptr<SmDevice2DeviceSemaphore>> smSemaphores;
+    std::vector<mscclpp::SemaphoreId> proxySemaphores;
+    for (ChannelType channelType : channelTypes) {
+      std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
+      for (ChannelInfo& info : channelInfos) {
+        for (int peer : info.connectedPeers) {
+          if (channelType == ChannelType::SM) {
+            smSemaphores.push_back(std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, this->connections.at(peer)));
+          } else if (channelType == ChannelType::PROXY) {
+            proxySemaphores.push_back(
+                this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer)));
+          }
         }
       }
     }
-  }
-  this->comm->setup();
-  context.smSemaphores = std::move(smSemaphores);
-  context.proxySemaphores = std::move(proxySemaphores);
-
-  auto getBuffer = [&](BufferType type) {
-    switch (type) {
-      case BufferType::INPUT:
-        return sendbuff;
-      case BufferType::OUTPUT:
-        return recvbuff;
-      case BufferType::SCRATCH:
-        return (void*)context.scratchBuffer.get();
-      default:
-        throw std::runtime_error("Invalid buffer type");
-    }
-  };
-  for (ChannelType channelType : channelTypes) {
-    std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
-    int index = 0;
-    for (ChannelInfo& info : channelInfos) {
-      void* src = getBuffer(info.srcBufferType);
-      TransportFlags transport = context.registeredMemories.begin()->second.transports();
-      RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport);
-      for (int peer : info.connectedPeers) {
-        if (channelType == ChannelType::SM) {
-          context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}],
-                                          src, nullptr);
-        } else if (channelType == ChannelType::PROXY) {
-          context.proxyChannels.emplace_back(
-              this->proxyService->proxyChannel(proxySemaphores[index]),
-              this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]),
-              this->proxyService->addMemory(localMemory));
+    this->comm->setup();
+    context.smSemaphores = std::move(smSemaphores);
+    context.proxySemaphores = std::move(proxySemaphores);
+
+    auto getBuffer = [&](BufferType type) {
+      switch (type) {
+        case BufferType::INPUT:
+          return sendbuff;
+        case BufferType::OUTPUT:
+          return recvbuff;
+        case BufferType::SCRATCH:
+          return (void*)context.scratchBuffer.get();
+        default:
+          throw std::runtime_error("Invalid buffer type");
+      }
+    };
+    for (ChannelType channelType : channelTypes) {
+      std::vector<ChannelInfo> channelInfos = plan.impl_->getChannelInfos(rank, channelType);
+      int index = 0;
+      for (ChannelInfo& info : channelInfos) {
+        void* src = getBuffer(info.srcBufferType);
+        TransportFlags transport = context.registeredMemories.begin()->second.transports();
+        RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport);
+        for (int peer : info.connectedPeers) {
+          if (channelType == ChannelType::SM) {
+            context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}],
+                                            src, nullptr);
+          } else if (channelType == ChannelType::PROXY) {
+            context.proxyChannels.emplace_back(
+                this->proxyService->proxyChannel(proxySemaphores[index]),
+                this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]),
+                this->proxyService->addMemory(localMemory));
+          }
         }
       }
     }
   }
-}
 
-void Executor::Impl::launchKernel(ExecutionContext& context) {
-  // Need to change to use flush function and make sure the proxy service will get the latest data.
-  // may need atomic variable
-  // this->proxyService->startProxy();
+  void launchKernel(ExecutionContext& context) {}
+};
+
+Executor::Executor(std::shared_ptr<Communicator> comm,
+                   const std::unordered_map<int, std::shared_ptr<Connection>> connections)
+    : impl_(std::make_unique<Impl>(comm, connections)) {}
+
+void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+                       const ExecutionPlan& plan) {
+  ExecutionContext context =
+      this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
+  this->impl_->launchKernel(context);
 }
 
 }  // namespace mscclpp

From c071e5d789a372def895b315126e1ea8d49fadc7 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 1 Apr 2024 12:00:38 +0000
Subject: [PATCH 09/51] add test file

---
 include/mscclpp/executor.hpp   |  2 +-
 src/executor/execution_plan.cc |  2 ++
 src/executor/executor.cc       |  2 ++
 test/CMakeLists.txt            |  1 +
 test/executor_test.cc          | 30 ++++++++++++++++++++++++++++++
 5 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 test/executor_test.cc

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index c9f7e7f59..c64230aa6 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -27,7 +27,7 @@ class Executor {
   Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
   Executor(const Executor&) = delete;
   Executor& operator=(const Executor&) = delete;
-  ~Executor() = default;
+  ~Executor();
 
   void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan);
 
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 5fe314cfc..65d372502 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -84,4 +84,6 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
   }
 }
 
+ExecutionPlan::ExecutionPlan(std::ifstream& file) : impl_(std::make_shared<Impl>(file)) {}
+
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 324671e27..fd75b4d61 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -212,4 +212,6 @@ void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size
   this->impl_->launchKernel(context);
 }
 
+Executor::~Executor() = default;
+
 }  // namespace mscclpp
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index da47066ea..501f96ab0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -24,6 +24,7 @@ endfunction()
 add_test_executable(allgather_test_cpp allgather_test_cpp.cu)
 add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu)
 add_test_executable(nvls_test nvls_test.cu)
+add_test_executable(executor_test executor_test.cc)
 
 configure_file(run_mpi_test.sh.in run_mpi_test.sh)
 
diff --git a/test/executor_test.cc b/test/executor_test.cc
new file mode 100644
index 000000000..f6ad8bd02
--- /dev/null
+++ b/test/executor_test.cc
@@ -0,0 +1,30 @@
+#include <mpi.h>
+
+#include <fstream>
+#include <mscclpp/executor.hpp>
+
+int main() {
+  int rank;
+  int world_size;
+  MPI_Init(NULL, NULL);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, world_size);
+  mscclpp::UniqueId id;
+  if (rank == 0) {
+    id = bootstrap->createUniqueId();
+  }
+  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
+  bootstrap->initialize(id);
+  auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
+  std::shared_ptr<mscclpp::Executor> executor =
+      std::make_shared<mscclpp::Executor>(comm, std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>());
+  std::ifstream file("execution_plan.json");
+  mscclpp::ExecutionPlan plan(file);
+  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
+  std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);
+  executor->execute(sendbuff.get(), recvbuff.get(), 1024, 1024, plan);
+
+  MPI_Finalize();
+  return 0;
+}

From 580e4a4b36f68c73bf77e8c05b58cc0f3d1c1ea6 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 1 Apr 2024 14:33:07 +0000
Subject: [PATCH 10/51] WIP

---
 include/mscclpp/executor.hpp   |  2 +-
 src/executor/execution_plan.cc | 11 +++++++++++
 src/executor/executor.cc       | 36 +++++++++++++++++++++++++---------
 src/include/execution_plan.hpp |  1 +
 test/executor_test.cc          |  3 +--
 5 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index c64230aa6..895fc03d8 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -24,7 +24,7 @@ class ExecutionPlan {
 
 class Executor {
  public:
-  Executor(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections);
+  Executor(std::shared_ptr<Communicator> comm);
   Executor(const Executor&) = delete;
   Executor& operator=(const Executor&) = delete;
   ~Executor();
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 65d372502..2020b970d 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -5,6 +5,7 @@
 
 #include <fstream>
 #include <nlohmann/json.hpp>
+#include <set>
 
 namespace {
 template <typename T, typename Predicate>
@@ -29,6 +30,16 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy
   return filter(this->channelInfos.at(rank), pred);
 }
 
+std::vector<int> ExecutionPlan::Impl::getConnectedPeers(int rank) const {
+  std::set<int> peers;
+  for (const auto& info : this->channelInfos.at(rank)) {
+    for (int peer : info.connectedPeers) {
+      peers.insert(peer);
+    }
+  }
+  return std::vector<int>(peers.begin(), peers.end());
+}
+
 std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const {
   return std::vector<BufferType>();
 }
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index fd75b4d61..4a3e367e3 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -41,6 +41,10 @@ struct hash<mscclpp::ExecutionContextKey> {
 }  // namespace std
 
 namespace {
+auto inSameNode = [](int rank1, int rank2, int nranksPerNode) {
+  return rank1 / nranksPerNode == rank2 / nranksPerNode;
+};
+
 static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2,
                                          mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5,
                                          mscclpp::Transport::IB6, mscclpp::Transport::IB7};
@@ -61,14 +65,11 @@ struct ExecutionContext {
 
 struct Executor::Impl {
   std::shared_ptr<Communicator> comm;
-  const std::unordered_map<int, std::shared_ptr<Connection>> connections;
+  std::unordered_map<int, std::shared_ptr<Connection>> connections;
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
 
-  Impl(std::shared_ptr<Communicator> comm, const std::unordered_map<int, std::shared_ptr<Connection>> connections)
-      : comm(comm), connections(connections) {
-    this->proxyService = std::make_shared<ProxyService>();
-  }
+  Impl(std::shared_ptr<Communicator> comm) : comm(comm) { this->proxyService = std::make_shared<ProxyService>(); }
   ~Impl() = default;
 
   ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
@@ -82,11 +83,26 @@ struct Executor::Impl {
     std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
     context.scratchBuffer = scratchBuffer;
     context.scratchBufferSize = scratchBufferSize;
+    this->setupConnections(context, rank, plan);
     this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
     this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan);
     return context;
   }
 
+  void setupConnections(ExecutionContext& context, int rank, const ExecutionPlan& plan) {
+    std::vector<int> connectedPeers = plan.impl_->getConnectedPeers(rank);
+    std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connectionFutures;
+    for (int peer : connectedPeers) {
+      Transport transport = inSameNode(rank, peer, plan.impl_->nranksPerNode) ? Transport::CudaIpc
+                                                                              : IBs[rank % plan.impl_->nranksPerNode];
+      connectionFutures.push_back(this->comm->connectOnSetup(peer, 0, transport));
+    }
+    this->comm->setup();
+    for (size_t i = 0; i < connectionFutures.size(); i++) {
+      this->connections[connectedPeers[i]] = connectionFutures[i].get();
+    }
+  }
+
   void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
                                size_t recvBufferSize, int rank, const ExecutionPlan& plan) {
     int nranksPerNode = plan.impl_->nranksPerNode;
@@ -96,7 +112,11 @@ struct Executor::Impl {
         if (info.channelType == ChannelType::SM) {
           flags |= Transport::CudaIpc;
         } else if (info.channelType == ChannelType::PROXY) {
-          flags |= IBs[rank % nranksPerNode];
+          for (int peer : info.connectedPeers) {
+            if (inSameNode(rank, peer, nranksPerNode)) {
+              flags |= IBs[rank % nranksPerNode];
+            }
+          }
         }
       }
       return flags;
@@ -201,9 +221,7 @@ struct Executor::Impl {
   void launchKernel(ExecutionContext& context) {}
 };
 
-Executor::Executor(std::shared_ptr<Communicator> comm,
-                   const std::unordered_map<int, std::shared_ptr<Connection>> connections)
-    : impl_(std::make_unique<Impl>(comm, connections)) {}
+Executor::Executor(std::shared_ptr<Communicator> comm) : impl_(std::make_unique<Impl>(comm)) {}
 
 void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
                        const ExecutionPlan& plan) {
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index e70ef5c6a..de1ba4496 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -79,6 +79,7 @@ struct ExecutionPlan::Impl {
 
   std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
   std::vector<ChannelInfo> getChannelInfos(int rank, BufferType bufferType) const;
+  std::vector<int> getConnectedPeers(int rank) const;
   std::vector<BufferType> getConnectedBufferTypes(int rank) const;
   size_t getScratchBufferSize(int rank, size_t inputSize) const;
   std::vector<Operation> getOperations(int rank, int threadblock);
diff --git a/test/executor_test.cc b/test/executor_test.cc
index f6ad8bd02..7059631ab 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -17,8 +17,7 @@ int main() {
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
   bootstrap->initialize(id);
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
-  std::shared_ptr<mscclpp::Executor> executor =
-      std::make_shared<mscclpp::Executor>(comm, std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>());
+  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm);
   std::ifstream file("execution_plan.json");
   mscclpp::ExecutionPlan plan(file);
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);

From d7026fbfdd63f5743ede571024420237485f7ef5 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 2 Apr 2024 06:03:33 +0000
Subject: [PATCH 11/51] WIP

---
 src/executor/execution_plan.cc      |  19 +-
 src/include/execution_plan.hpp      |   3 +
 test/execution-files/allreduce.json | 451 ++++++++++++++++++++++++++++
 test/executor_test.cc               |   7 +-
 4 files changed, 472 insertions(+), 8 deletions(-)
 create mode 100644 test/execution-files/allreduce.json

diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 2020b970d..a962e956f 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -43,7 +43,9 @@ std::vector<int> ExecutionPlan::Impl::getConnectedPeers(int rank) const {
 std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const {
   return std::vector<BufferType>();
 }
-size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { return 0; };
+size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const {
+  return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank);
+}
 std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadblock) {
   return std::vector<Operation>();
 }
@@ -54,11 +56,11 @@ std::pair<int, int> ExecutionPlan::Impl::getThreadBlockChannelRange(int rank, in
 
 void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
   auto convertToBufferType = [](const std::string& str) {
-    if (str == "input") {
+    if (str == "i") {
       return BufferType::INPUT;
-    } else if (str == "output") {
+    } else if (str == "o") {
       return BufferType::OUTPUT;
-    } else if (str == "scratch") {
+    } else if (str == "s") {
       return BufferType::SCRATCH;
     } else {
       throw std::runtime_error("Invalid buffer type");
@@ -79,12 +81,15 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
   this->nranksPerNode = obj["nranksPerNode"];
   auto gpus = obj["gpus"];
   for (const auto& gpu : gpus) {
-    int rank = gpu["rank"];
+    int rank = gpu["id"];
+    this->inputChunks[rank] = gpu["inputChunks"];
+    this->outputChunks[rank] = gpu["outputChunks"];
+    this->scratchChunks[rank] = gpu["scratchChunks"];
     std::vector<ChannelInfo> channelInfos;
     for (const auto& channel : gpu["channels"]) {
       ChannelInfo info;
-      info.srcBufferType = convertToBufferType(channel["srcBuffer"]);
-      info.dstBufferType = convertToBufferType(channel["dstBuffer"]);
+      info.srcBufferType = convertToBufferType(channel["srcbuff"]);
+      info.dstBufferType = convertToBufferType(channel["dstbuff"]);
       info.channelType = convertToChannelType(channel["type"]);
       for (const auto& peer : channel["connectedTo"]) {
         info.connectedPeers.push_back(peer);
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index de1ba4496..70df39c47 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -92,6 +92,9 @@ struct ExecutionPlan::Impl {
   std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
   std::string name;
   int nranksPerNode;
+  std::unordered_map<int, uint32_t> inputChunks;
+  std::unordered_map<int, uint32_t> outputChunks;
+  std::unordered_map<int, uint32_t> scratchChunks;
 };
 
 }  // namespace mscclpp
diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json
new file mode 100644
index 000000000..60a8ffe88
--- /dev/null
+++ b/test/execution-files/allreduce.json
@@ -0,0 +1,451 @@
+{
+  "name": "allreduce_pairs",
+  "colletive": "allreduce",
+  "protocol": "Simple",
+  "inplace": true,
+  "nranksPerNode": 8,
+  "gpus": [
+    {
+      "id": 0,
+      "inputChunks": 4,
+      "outputChunks": 0,
+      "scratchChunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 0,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "dstoff": 0,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 0,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cid": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 3
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 1
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 1
+                }
+              ],
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "dstoff": 1,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cid": [
+                1
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "srcbuff": "i",
+          "dstbuff": "i",
+          "type": "sm",
+          "connectedTo": [
+            1,
+            1
+          ]
+        }
+      ]
+    },
+    {
+      "id": 1,
+      "inputChunks": 4,
+      "outputChunks": 0,
+      "scratchChunks": 0,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 0,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "dstoff": 2,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 0,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cid": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 1
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 3
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 3
+                }
+              ],
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "dstoff": 3,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            },
+            {
+              "name": "wait",
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "dstbuff": "i",
+              "ctype": "sm"
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cid": [
+                1
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "srcbuff": "i",
+          "dstbuff": "i",
+          "type": "sm",
+          "connectedTo": [
+            0,
+            0
+          ]
+        }
+      ]
+    }
+  ]
+}
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 7059631ab..9a8bcb72e 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -3,6 +3,8 @@
 #include <fstream>
 #include <mscclpp/executor.hpp>
 
+const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp";
+
 int main() {
   int rank;
   int world_size;
@@ -16,9 +18,12 @@ int main() {
   }
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
   bootstrap->initialize(id);
+  // sleep 10s
+  // std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm);
-  std::ifstream file("execution_plan.json");
+
+  std::ifstream file(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   mscclpp::ExecutionPlan plan(file);
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
   std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);

From b34c9e85056c40211796900e7ddeef4176139575 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 2 Apr 2024 06:58:16 +0000
Subject: [PATCH 12/51] WIP

---
 include/mscclpp/executor.hpp   |  3 ++-
 src/executor/execution_plan.cc |  6 +++++-
 src/executor/executor.cc       | 21 +++++++++++----------
 test/executor_test.cc          |  2 +-
 4 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index 895fc03d8..bf09a0d6e 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -29,7 +29,8 @@ class Executor {
   Executor& operator=(const Executor&) = delete;
   ~Executor();
 
-  void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan);
+  void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+               const ExecutionPlan& plan);
 
  private:
   struct Impl;
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index a962e956f..c6b332fb8 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -41,7 +41,11 @@ std::vector<int> ExecutionPlan::Impl::getConnectedPeers(int rank) const {
 }
 
 std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const {
-  return std::vector<BufferType>();
+  std::set<BufferType> bufferTypes;
+  for (const auto& info : this->channelInfos.at(rank)) {
+    bufferTypes.insert(info.dstBufferType);
+  }
+  return std::vector<BufferType>(bufferTypes.begin(), bufferTypes.end());
 }
 size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const {
   return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank);
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 4a3e367e3..94565cdcb 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -53,6 +53,7 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans
 namespace mscclpp {
 
 struct ExecutionContext {
+  std::unordered_map<int, std::shared_ptr<Connection>> connections;
   std::unordered_map<std::pair<BufferType, int>, mscclpp::RegisteredMemory> registeredMemories;
   std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
   std::vector<mscclpp::SemaphoreId> proxySemaphores;
@@ -65,7 +66,6 @@ struct ExecutionContext {
 
 struct Executor::Impl {
   std::shared_ptr<Communicator> comm;
-  std::unordered_map<int, std::shared_ptr<Connection>> connections;
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
 
@@ -99,7 +99,7 @@ struct Executor::Impl {
     }
     this->comm->setup();
     for (size_t i = 0; i < connectionFutures.size(); i++) {
-      this->connections[connectedPeers[i]] = connectionFutures[i].get();
+      context.connections[connectedPeers[i]] = connectionFutures[i].get();
     }
   }
 
@@ -113,7 +113,7 @@ struct Executor::Impl {
           flags |= Transport::CudaIpc;
         } else if (info.channelType == ChannelType::PROXY) {
           for (int peer : info.connectedPeers) {
-            if (inSameNode(rank, peer, nranksPerNode)) {
+            if (!inSameNode(rank, peer, nranksPerNode)) {
               flags |= IBs[rank % nranksPerNode];
             }
           }
@@ -172,10 +172,11 @@ struct Executor::Impl {
       for (ChannelInfo& info : channelInfos) {
         for (int peer : info.connectedPeers) {
           if (channelType == ChannelType::SM) {
-            smSemaphores.push_back(std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, this->connections.at(peer)));
+            smSemaphores.push_back(
+                std::make_shared<SmDevice2DeviceSemaphore>(*this->comm, context.connections.at(peer)));
           } else if (channelType == ChannelType::PROXY) {
             proxySemaphores.push_back(
-                this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer)));
+                this->proxyService->buildAndAddSemaphore(*this->comm, context.connections.at(peer)));
           }
         }
       }
@@ -205,11 +206,11 @@ struct Executor::Impl {
         RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport);
         for (int peer : info.connectedPeers) {
           if (channelType == ChannelType::SM) {
-            context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}],
-                                            src, nullptr);
+            context.smChannels.emplace_back(context.smSemaphores[index],
+                                            context.registeredMemories[{info.dstBufferType, peer}], src, nullptr);
           } else if (channelType == ChannelType::PROXY) {
             context.proxyChannels.emplace_back(
-                this->proxyService->proxyChannel(proxySemaphores[index]),
+                this->proxyService->proxyChannel(context.proxySemaphores[index]),
                 this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]),
                 this->proxyService->addMemory(localMemory));
           }
@@ -223,10 +224,10 @@ struct Executor::Impl {
 
 Executor::Executor(std::shared_ptr<Communicator> comm) : impl_(std::make_unique<Impl>(comm)) {}
 
-void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
                        const ExecutionPlan& plan) {
   ExecutionContext context =
-      this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
+      this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
   this->impl_->launchKernel(context);
 }
 
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 9a8bcb72e..35c4cce2b 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -27,7 +27,7 @@ int main() {
   mscclpp::ExecutionPlan plan(file);
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
   std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);
-  executor->execute(sendbuff.get(), recvbuff.get(), 1024, 1024, plan);
+  executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, plan);
 
   MPI_Finalize();
   return 0;

From faef1e4425dc7080a18496156393a8fab6bae7f8 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 2 Apr 2024 07:23:52 +0000
Subject: [PATCH 13/51] WIP

---
 include/mscclpp/executor.hpp        |  2 +-
 src/executor/execution_plan.cc      |  1 -
 src/executor/executor.cc            | 17 ++++++++++-------
 src/include/execution_plan.hpp      |  1 -
 test/execution-files/allreduce.json |  1 -
 test/executor_test.cc               |  2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index bf09a0d6e..1efd2a747 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -24,7 +24,7 @@ class ExecutionPlan {
 
 class Executor {
  public:
-  Executor(std::shared_ptr<Communicator> comm);
+  Executor(std::shared_ptr<Communicator> comm, int nranksPerNode);
   Executor(const Executor&) = delete;
   Executor& operator=(const Executor&) = delete;
   ~Executor();
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index c6b332fb8..9b6c284c3 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -82,7 +82,6 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
 
   json obj = json::parse(file);
   this->name = obj["name"];
-  this->nranksPerNode = obj["nranksPerNode"];
   auto gpus = obj["gpus"];
   for (const auto& gpu : gpus) {
     int rank = gpu["id"];
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 94565cdcb..47ee0be70 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -65,11 +65,14 @@ struct ExecutionContext {
 };
 
 struct Executor::Impl {
+  int nranksPerNode;
   std::shared_ptr<Communicator> comm;
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
 
-  Impl(std::shared_ptr<Communicator> comm) : comm(comm) { this->proxyService = std::make_shared<ProxyService>(); }
+  Impl(std::shared_ptr<Communicator> comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) {
+    this->proxyService = std::make_shared<ProxyService>();
+  }
   ~Impl() = default;
 
   ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
@@ -93,8 +96,8 @@ struct Executor::Impl {
     std::vector<int> connectedPeers = plan.impl_->getConnectedPeers(rank);
     std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connectionFutures;
     for (int peer : connectedPeers) {
-      Transport transport = inSameNode(rank, peer, plan.impl_->nranksPerNode) ? Transport::CudaIpc
-                                                                              : IBs[rank % plan.impl_->nranksPerNode];
+      Transport transport =
+          inSameNode(rank, peer, this->nranksPerNode) ? Transport::CudaIpc : IBs[rank % this->nranksPerNode];
       connectionFutures.push_back(this->comm->connectOnSetup(peer, 0, transport));
     }
     this->comm->setup();
@@ -105,7 +108,6 @@ struct Executor::Impl {
 
   void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize,
                                size_t recvBufferSize, int rank, const ExecutionPlan& plan) {
-    int nranksPerNode = plan.impl_->nranksPerNode;
     auto getTransportFlags = [&](std::vector<ChannelInfo>& infos, int rank) {
       TransportFlags flags;
       for (ChannelInfo& info : infos) {
@@ -113,8 +115,8 @@ struct Executor::Impl {
           flags |= Transport::CudaIpc;
         } else if (info.channelType == ChannelType::PROXY) {
           for (int peer : info.connectedPeers) {
-            if (!inSameNode(rank, peer, nranksPerNode)) {
-              flags |= IBs[rank % nranksPerNode];
+            if (!inSameNode(rank, peer, this->nranksPerNode)) {
+              flags |= IBs[rank % this->nranksPerNode];
             }
           }
         }
@@ -222,7 +224,8 @@ struct Executor::Impl {
   void launchKernel(ExecutionContext& context) {}
 };
 
-Executor::Executor(std::shared_ptr<Communicator> comm) : impl_(std::make_unique<Impl>(comm)) {}
+Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
+    : impl_(std::make_unique<Impl>(comm, nranksPerNode)) {}
 
 void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
                        const ExecutionPlan& plan) {
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 70df39c47..759f3ebc7 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -91,7 +91,6 @@ struct ExecutionPlan::Impl {
   std::vector<std::vector<Operation>> operations;
   std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
   std::string name;
-  int nranksPerNode;
   std::unordered_map<int, uint32_t> inputChunks;
   std::unordered_map<int, uint32_t> outputChunks;
   std::unordered_map<int, uint32_t> scratchChunks;
diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json
index 60a8ffe88..58db26c08 100644
--- a/test/execution-files/allreduce.json
+++ b/test/execution-files/allreduce.json
@@ -3,7 +3,6 @@
   "colletive": "allreduce",
   "protocol": "Simple",
   "inplace": true,
-  "nranksPerNode": 8,
   "gpus": [
     {
       "id": 0,
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 35c4cce2b..ccc0356a1 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -21,7 +21,7 @@ int main() {
   // sleep 10s
   // std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
-  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm);
+  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
 
   std::ifstream file(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   mscclpp::ExecutionPlan plan(file);

From a80bcee1f62ef4f5b51d67851bb6304b60e1066d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 3 Apr 2024 07:12:07 +0000
Subject: [PATCH 14/51] build pass

---
 include/mscclpp/executor.hpp        |   2 +-
 src/executor/execution_plan.cc      | 196 ++++++++++++++++++++++-----
 src/executor/executor.cc            |  29 +++-
 src/include/execution_plan.hpp      |  71 +++++++---
 test/execution-files/allreduce.json | 200 +++++++++++++++++++---------
 test/executor_test.cc               |   4 +-
 6 files changed, 389 insertions(+), 113 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index 1efd2a747..076238336 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -12,7 +12,7 @@ namespace mscclpp {
 
 class ExecutionPlan {
  public:
-  ExecutionPlan(std::ifstream& file);
+  ExecutionPlan(std::string planPath);
   ~ExecutionPlan() = default;
 
  private:
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 9b6c284c3..0dff4adbd 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -4,7 +4,6 @@
 #include "execution_plan.hpp"
 
 #include <fstream>
-#include <nlohmann/json.hpp>
 #include <set>
 
 namespace {
@@ -14,17 +13,67 @@ std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {
   std::copy_if(vec.begin(), vec.end(), std::back_inserter(filtered), pred);
   return filtered;
 }
+
+auto getOpType = [](const std::string& str) {
+  if (str == "nop") {
+    return mscclpp::OperationType::BARRIER;
+  } else if (str == "put") {
+    return mscclpp::OperationType::PUT;
+  } else if (str == "get") {
+    return mscclpp::OperationType::GET;
+  } else if (str == "copy") {
+    return mscclpp::OperationType::COPY;
+  } else if (str == "signal") {
+    return mscclpp::OperationType::SIGNAL;
+  } else if (str == "wait") {
+    return mscclpp::OperationType::WAIT;
+  } else if (str == "flush") {
+    return mscclpp::OperationType::FLUSH;
+  } else if (str == "reduce") {
+    return mscclpp::OperationType::REDUCE;
+  } else if (str == "read_reduce_copy") {
+    return mscclpp::OperationType::READ_REDUCE_COPY;
+  } else if (str == "read_reduce_copy_put") {
+    return mscclpp::OperationType::READ_REDUCE_COPY_PUT;
+  } else {
+    throw std::runtime_error("Invalid operation type");
+  }
+};
+
+auto convertToBufferType = [](const std::string& str) {
+  if (str == "i") {
+    return mscclpp::BufferType::INPUT;
+  } else if (str == "o") {
+    return mscclpp::BufferType::OUTPUT;
+  } else if (str == "s") {
+    return mscclpp::BufferType::SCRATCH;
+  } else {
+    throw std::runtime_error("Invalid buffer type");
+  }
+};
+
+auto convertToChannelType = [](const std::string& str) {
+  if (str == "sm") {
+    return mscclpp::ChannelType::SM;
+  } else if (str == "proxy") {
+    return mscclpp::ChannelType::PROXY;
+  } else {
+    throw std::runtime_error("Invalid channel type");
+  }
+};
+
 }  // namespace
 
 namespace mscclpp {
 using json = nlohmann::json;
 
-ExecutionPlan::Impl::Impl(std::ifstream& file) { this->loadExecutionPlan(file); }
+ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath) {}
 
 std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const {
   auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; };
   return filter(this->channelInfos.at(rank), pred);
 }
+
 std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferType dstBufferType) const {
   auto pred = [dstBufferType](const ChannelInfo& info) { return info.dstBufferType == dstBufferType; };
   return filter(this->channelInfos.at(rank), pred);
@@ -50,44 +99,37 @@ std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c
 size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const {
   return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank);
 }
-std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadblock) {
-  return std::vector<Operation>();
-}
-std::pair<int, int> ExecutionPlan::Impl::getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
-                                                                    BufferType dstBufferType, ChannelType channelType) {
-  return std::make_pair(0, 0);
+std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadblock) const {
+  return this->operations.at(rank)[threadblock];
 }
 
-void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
-  auto convertToBufferType = [](const std::string& str) {
-    if (str == "i") {
-      return BufferType::INPUT;
-    } else if (str == "o") {
-      return BufferType::OUTPUT;
-    } else if (str == "s") {
-      return BufferType::SCRATCH;
-    } else {
-      throw std::runtime_error("Invalid buffer type");
-    }
-  };
-  auto convertToChannelType = [](const std::string& str) {
-    if (str == "sm") {
-      return ChannelType::SM;
-    } else if (str == "proxy") {
-      return ChannelType::PROXY;
-    } else {
-      throw std::runtime_error("Invalid channel type");
-    }
-  };
+int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->operations.at(rank).size(); }
 
+void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) {
+  std::ifstream file(this->planPath);
   json obj = json::parse(file);
   this->name = obj["name"];
   auto gpus = obj["gpus"];
+
   for (const auto& gpu : gpus) {
     int rank = gpu["id"];
     this->inputChunks[rank] = gpu["inputChunks"];
     this->outputChunks[rank] = gpu["outputChunks"];
     this->scratchChunks[rank] = gpu["scratchChunks"];
+  }
+  this->setupChannels(gpus);
+
+  uint32_t maxInputChunks = 0;
+  for (const auto& [rank, chunks] : this->inputChunks) {
+    maxInputChunks = std::max(maxInputChunks, chunks);
+  }
+  this->chunkSize = inputSize / maxInputChunks;
+  this->setupOperations(gpus);
+}
+
+void ExecutionPlan::Impl::setupChannels(const json& gpus) {
+  for (const auto& gpu : gpus) {
+    int rank = gpu["id"];
     std::vector<ChannelInfo> channelInfos;
     for (const auto& channel : gpu["channels"]) {
       ChannelInfo info;
@@ -101,8 +143,102 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) {
     }
     this->channelInfos[rank] = channelInfos;
   }
+
+  // setup threadblockChannelMap
+  for (const auto& gpu : gpus) {
+    int rank = gpu["id"];
+    auto channelTypes = {ChannelType::SM, ChannelType::PROXY};
+    std::unordered_map<ChannelKey, std::vector<int>> channelMap;
+    for (auto channelType : channelTypes) {
+      const std::vector<ChannelInfo> channelInfos = this->getChannelInfos(rank, channelType);
+      for (size_t i = 0; i < channelInfos.size(); i++) {
+        const ChannelInfo& info = channelInfos[i];
+        ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType};
+        channelMap[key].push_back(i);
+      }
+    }
+    for (const auto& threadblock : gpu["threadblocks"]) {
+      for (const auto& channel : threadblock["channels"]) {
+        ChannelType channelType = convertToChannelType(channel["ctype"]);
+        ChannelKey key = {convertToBufferType(channel["src"]), convertToBufferType(channel["dst"]), channelType};
+        for (int id : channel["cids"]) {
+          if (channelType == ChannelType::SM) {
+            this->threadblockSMChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
+          } else if (channelType == ChannelType::PROXY) {
+            this->threadblockProxyChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key);
+          }
+        }
+      }
+    }
+  }
+}
+
+void ExecutionPlan::Impl::setupOperations(const json& gpus) {
+  // For every rank and threadblock in the plan, translate the JSON "ops" list into Operation records.
+  for (const auto& gpu : gpus) {
+    int rank = gpu["id"];
+    for (const auto& threadblock : gpu["threadblocks"]) {
+      std::unordered_map<ChannelKey, std::vector<int>> channelIndexes;
+      std::vector<Operation> ops;
+      int threadblockId = threadblock["id"];
+      const auto& smChannels = this->threadblockSMChannelMap[rank][threadblockId];
+      const auto& proxyChannels = this->threadblockProxyChannelMap[rank][threadblockId];
+      for (size_t i = 0; i < smChannels.size(); i++) {
+        const auto& [_, key] = smChannels[i];
+        channelIndexes[key].push_back(i);  // i = position within this threadblock's SM channel list
+      }
+      for (size_t i = 0; i < proxyChannels.size(); i++) {
+        const auto& [_, key] = proxyChannels[i];
+        channelIndexes[key].push_back(i);  // i = position within this threadblock's proxy channel list
+      }
+      for (const auto& op : threadblock["ops"]) {
+        Operation operation = {};  // zero-initialized; fields absent from the JSON keep their zero value
+        operation.type = static_cast<mscclpp::OperationType>(getOpType(op["name"]));
+        if (op.contains("ctype")) {
+          operation.channelType = convertToChannelType(op["ctype"]);
+        }
+        if (op.contains("i_cids")) {
+          operation.nInputChannels = op["i_cids"].size();
+        }
+        if (op.contains("o_cids")) {
+          operation.nOutputChannels = op["o_cids"].size();
+        }
+        for (int i = 0; i < operation.nInputChannels; i++) {
+          BufferType srcBufferType = convertToBufferType(op["i_buff"][i]["src"]);
+          BufferType dstBufferType = convertToBufferType(op["i_buff"][i]["dst"]);
+          operation.inputChannelIndex[i] =
+              channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
+          operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["offset"];
+        }
+        for (int i = 0; i < operation.nOutputChannels; i++) {
+          BufferType srcBufferType = convertToBufferType(op["o_buff"][i]["src"]);
+          BufferType dstBufferType = convertToBufferType(op["o_buff"][i]["dst"]);
+          operation.outputChannelIndex[i] =
+              channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
+          operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["offset"];
+        }
+        if (op.contains("srcbuff")) {
+          operation.srcBufferType = convertToBufferType(op["srcbuff"]);
+        }
+        if (op.contains("srcoff")) {
+          operation.srcOffset = (int)op["srcoff"] * this->chunkSize;  // chunk index scaled by chunkSize
+        }
+        if (op.contains("dstbuff")) {
+          operation.dstBufferType = convertToBufferType(op["dstbuff"]);
+        }
+        if (op.contains("dstoff")) {
+          operation.dstOffset = (int)op["dstoff"] * this->chunkSize;  // chunk index scaled by chunkSize
+        }
+        if (op.contains("cnt")) {
+          operation.size = this->chunkSize * (int)op["cnt"];  // chunk count scaled by chunkSize
+        }
+        ops.push_back(operation);
+      }
+      this->operations[rank].push_back(ops);  // appended in the order threadblocks appear in the JSON
+    }
+  }
 }
 
-ExecutionPlan::ExecutionPlan(std::ifstream& file) : impl_(std::make_shared<Impl>(file)) {}
+ExecutionPlan::ExecutionPlan(std::string planPath) : impl_(std::make_shared<Impl>(planPath)) {}
 
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 47ee0be70..445ebccad 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -60,6 +60,7 @@ struct ExecutionContext {
   std::vector<mscclpp::SmChannel> smChannels;
   std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
   std::vector<DeviceExecutionPlan> deviceExecutionPlans;
+  std::vector<std::vector<Operation>> operations;
   std::shared_ptr<char> scratchBuffer;
   size_t scratchBufferSize;
 };
@@ -81,6 +82,8 @@ struct Executor::Impl {
     if (this->contexts.find(key) != this->contexts.end()) {
       return this->contexts[key];
     }
+    plan.impl_->loadExecutionPlan(sendBufferSize);
+
     ExecutionContext context;
     size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize);
     std::shared_ptr<char> scratchBuffer = allocExtSharedCuda<char>(scratchBufferSize);
@@ -89,6 +92,7 @@ struct Executor::Impl {
     this->setupConnections(context, rank, plan);
     this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
     this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan);
+    this->setupDeviceExecutionPlan(context, rank, plan);
     return context;
   }
 
@@ -221,7 +225,30 @@ struct Executor::Impl {
     }
   }
 
-  void launchKernel(ExecutionContext& context) {}
+  void setupDeviceExecutionPlan(ExecutionContext& context, int rank, const ExecutionPlan& plan) {
+    std::vector<DeviceExecutionPlan> deviceExecutionPlans;
+    for (int threadblock = 0; threadblock < plan.impl_->getThreadblockCount(rank); threadblock++) {
+      DeviceExecutionPlan devPlan = {};  // zero-init so the channel counters below start at 0
+      std::vector<Operation> ops = plan.impl_->getOperations(rank, threadblock);
+      devPlan.nOperations = ops.size();  // read the size before ops is moved from
+      context.operations.emplace_back(std::move(ops));
+      // Pack handles in the threadblock's own channel order: setupOperations() stores those positions in
+      // Operation::inputChannelIndex/outputChannelIndex, while `index` selects the channel in the context.
+      for (const auto& [index, key] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) {
+        devPlan.channels.smChannels[devPlan.nSmChannels++] = mscclpp::deviceHandle(context.smChannels[index]);
+      }
+      for (const auto& [index, key] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) {
+        devPlan.channels.proxyChannels[devPlan.nProxyChannels++] = mscclpp::deviceHandle(context.proxyChannels[index]);
+      }
+      deviceExecutionPlans.push_back(devPlan);
+    }
+    context.deviceExecutionPlans = std::move(deviceExecutionPlans);
+  }
+
+  void launchKernel(ExecutionContext& context) {
+    // copy context to shared memory
+    // launch kernel
+  }
 };
 
 Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 759f3ebc7..768799846 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -8,11 +8,46 @@
 #include <mscclpp/executor.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
+#include <nlohmann/json.hpp>
 #include <string>
 #include <unordered_map>
 
 namespace mscclpp {
 
+enum class BufferType {
+  INPUT,
+  OUTPUT,
+  SCRATCH,
+};
+
+enum class ChannelType {
+  SM,
+  PROXY,
+};
+
+struct ChannelKey {  // (source buffer, destination buffer, channel type) triple used as an unordered_map key
+  BufferType srcBufferType;
+  BufferType dstBufferType;
+  ChannelType channelType;
+  bool operator==(const ChannelKey& other) const {  // member-wise equality over all three fields
+    return srcBufferType == other.srcBufferType && dstBufferType == other.dstBufferType &&
+           channelType == other.channelType;
+  }
+};
+}  // namespace mscclpp
+
+namespace std {
+template <>
+struct hash<mscclpp::ChannelKey> {
+  std::size_t operator()(const mscclpp::ChannelKey& key) const {  // fields go in disjoint bit ranges, not XOR
+    return std::hash<int>()((static_cast<int>(key.srcBufferType) << 16) |
+                            (static_cast<int>(key.dstBufferType) << 8) | static_cast<int>(key.channelType));
+  }
+};
+}  // namespace std
+
+namespace mscclpp {
+
 constexpr int MAX_CHANNEL = 24;
 constexpr int MAX_CHANNEL_PER_OPERATION = 8;
 
@@ -29,17 +64,6 @@ enum class OperationType {
   READ_REDUCE_COPY_PUT,
 };
 
-enum class ChannelType {
-  SM,
-  PROXY,
-};
-
-enum class BufferType {
-  INPUT,
-  OUTPUT,
-  SCRATCH,
-};
-
 struct ChannelInfo {
   BufferType srcBufferType;
   BufferType dstBufferType;
@@ -55,10 +79,14 @@ struct Channels {
 struct Operation {
   OperationType type;
   ChannelType channelType;
+  uint16_t nInputChannels;
+  uint16_t nOutputChannels;
   uint16_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
   uint16_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
   size_t inputOffset[MAX_CHANNEL_PER_OPERATION];
   size_t outputOffset[MAX_CHANNEL_PER_OPERATION];
+  BufferType srcBufferType;
+  BufferType dstBufferType;
   size_t srcOffset;
   size_t dstOffset;
   size_t size;
@@ -74,7 +102,7 @@ struct DeviceExecutionPlan {
 
 struct ExecutionPlan::Impl {
  public:
-  Impl(std::ifstream& file);
+  Impl(std::string planPath);
   ~Impl() = default;
 
   std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
@@ -82,18 +110,25 @@ struct ExecutionPlan::Impl {
   std::vector<int> getConnectedPeers(int rank) const;
   std::vector<BufferType> getConnectedBufferTypes(int rank) const;
   size_t getScratchBufferSize(int rank, size_t inputSize) const;
-  std::vector<Operation> getOperations(int rank, int threadblock);
-  std::pair<int, int> getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType,
-                                                 BufferType dstBufferType, ChannelType channelType);
-  void loadExecutionPlan(std::ifstream& file);
+  std::vector<Operation> getOperations(int rank, int threadblock) const;
+  int getThreadblockCount(int rank) const;
+
+  void loadExecutionPlan(size_t inputSize);
+  void setupChannels(const nlohmann::json& gpus);
+  void setupOperations(const nlohmann::json& gpus);
 
-  // operations for [rank][threadblock]
-  std::vector<std::vector<Operation>> operations;
+  std::string planPath;
+  // operations for [rank][threadblock] = [operations]
+  std::unordered_map<int, std::vector<std::vector<Operation>>> operations;
   std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
+  // threadblockChannelMap[rank][threadblock] = [channelIndex]
+  std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockSMChannelMap;
+  std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockProxyChannelMap;
   std::string name;
   std::unordered_map<int, uint32_t> inputChunks;
   std::unordered_map<int, uint32_t> outputChunks;
   std::unordered_map<int, uint32_t> scratchChunks;
+  size_t chunkSize;
 };
 
 }  // namespace mscclpp
diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json
index 58db26c08..b2032e90b 100644
--- a/test/execution-files/allreduce.json
+++ b/test/execution-files/allreduce.json
@@ -15,27 +15,33 @@
           "ops": [
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "nop",
@@ -47,20 +53,28 @@
               ]
             },
             {
-              "name": "rrcs",
+              "name": "rrs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
+              "dst": 0,
               "dstbuff": "i",
               "dstoff": 0,
               "ctype": "sm",
@@ -77,27 +91,33 @@
             },
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             }
           ],
           "channels": [
@@ -105,7 +125,7 @@
               "src": "i",
               "dst": "i",
               "ctype": "sm",
-              "cid": [
+              "cids": [
                 0
               ]
             }
@@ -116,27 +136,33 @@
           "ops": [
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 3
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 1
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "nop",
@@ -148,20 +174,28 @@
               ]
             },
             {
-              "name": "rrcs",
+              "name": "rrs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 1
                 }
               ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
+              "dst": 0,
               "dstbuff": "i",
               "dstoff": 1,
               "ctype": "sm",
@@ -178,27 +212,33 @@
             },
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             }
           ],
           "channels": [
@@ -206,7 +246,7 @@
               "src": "i",
               "dst": "i",
               "ctype": "sm",
-              "cid": [
+              "cids": [
                 1
               ]
             }
@@ -236,27 +276,33 @@
           "ops": [
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "nop",
@@ -268,20 +314,28 @@
               ]
             },
             {
-              "name": "rrcs",
+              "name": "rrs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
+              "dst": 1,
               "dstbuff": "i",
               "dstoff": 2,
               "ctype": "sm",
@@ -298,27 +352,33 @@
             },
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             }
           ],
           "channels": [
@@ -326,7 +386,7 @@
               "src": "i",
               "dst": "i",
               "ctype": "sm",
-              "cid": [
+              "cids": [
                 0
               ]
             }
@@ -337,27 +397,33 @@
           "ops": [
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 1
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 3
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "nop",
@@ -369,20 +435,28 @@
               ]
             },
             {
-              "name": "rrcs",
+              "name": "rrs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 3
                 }
               ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
+              "dst": 1,
               "dstbuff": "i",
               "dstoff": 3,
               "ctype": "sm",
@@ -399,27 +473,33 @@
             },
             {
               "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "o_cids": [
                 {
                   "id": 0,
                   "off": 2
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             },
             {
               "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
               "i_cids": [
                 {
                   "id": 0,
                   "off": 0
                 }
               ],
-              "srcbuff": "i",
-              "dstbuff": "i",
-              "ctype": "sm"
+              "ctype": "sm",
+              "cnt": 1
             }
           ],
           "channels": [
@@ -427,7 +507,7 @@
               "src": "i",
               "dst": "i",
               "ctype": "sm",
-              "cid": [
+              "cids": [
                 1
               ]
             }
diff --git a/test/executor_test.cc b/test/executor_test.cc
index ccc0356a1..865ba2122 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -1,6 +1,5 @@
 #include <mpi.h>
 
-#include <fstream>
 #include <mscclpp/executor.hpp>
 
 const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp";
@@ -23,8 +22,7 @@ int main() {
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
 
-  std::ifstream file(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
-  mscclpp::ExecutionPlan plan(file);
+  mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
   std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);
   executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, plan);

From a1a11da7ed83a76e0188482ea056a7d6fc6cf656 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 3 Apr 2024 08:27:18 +0000
Subject: [PATCH 15/51] update struct

---
 src/executor/executor.cc       |  3 +++
 src/include/execution_plan.hpp | 38 ++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 445ebccad..173870b74 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -247,6 +247,9 @@ struct Executor::Impl {
 
   void launchKernel(ExecutionContext& context) {
     // copy context to shared memory
+    // std::cout << sizeof(Channels) << std::endl;
+    // std::cout << sizeof(Operation) << std::endl;
+    // std::cout << sizeof(DeviceExecutionPlan) << std::endl;
     // launch kernel
   }
 };
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 768799846..25595b093 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -14,13 +14,13 @@
 
 namespace mscclpp {
 
-enum class BufferType {
+enum class BufferType : uint8_t {
   INPUT,
   OUTPUT,
   SCRATCH,
 };
 
-enum class ChannelType {
+enum class ChannelType : uint8_t {
   SM,
   PROXY,
 };
@@ -48,10 +48,11 @@ struct hash<mscclpp::ChannelKey> {
 
 namespace mscclpp {
 
-constexpr int MAX_CHANNEL = 24;
+constexpr int MAX_CHANNEL = 16;
 constexpr int MAX_CHANNEL_PER_OPERATION = 8;
+constexpr int MAX_OPERATION = 64;
 
-enum class OperationType {
+enum class OperationType : uint8_t {
   BARRIER,
   PUT,
   GET,
@@ -79,25 +80,26 @@ struct Channels {
 struct Operation {
   OperationType type;
   ChannelType channelType;
-  uint16_t nInputChannels;
-  uint16_t nOutputChannels;
-  uint16_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
-  uint16_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
-  size_t inputOffset[MAX_CHANNEL_PER_OPERATION];
-  size_t outputOffset[MAX_CHANNEL_PER_OPERATION];
   BufferType srcBufferType;
   BufferType dstBufferType;
-  size_t srcOffset;
-  size_t dstOffset;
-  size_t size;
+  uint8_t nInputChannels;
+  uint8_t nOutputChannels;
+  uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
+  uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
+  uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION];
+  uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION];
+  uint32_t srcOffset;
+  uint32_t dstOffset;
+  uint32_t size;
 };
 
+// total size = 1920 + 6400 + 4 + 4(padding) = 8328 bytes
 struct DeviceExecutionPlan {
-  int nSmChannels;
-  int nProxyChannels;
-  int nOperations;
-  Channels channels;
-  Operation operations[1];
+  uint8_t nSmChannels;                  // 1 bytes
+  uint8_t nProxyChannels;               // 1 bytes
+  uint16_t nOperations;                 // 2 bytes
+  Channels channels;                    // 1920 bytes
+  Operation operations[MAX_OPERATION];  // 64 * 100 = 6400 bytes
 };
 
 struct ExecutionPlan::Impl {

From 4b5668c0487f17d4f9d8df77a25ac34c06bc5128 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 3 Apr 2024 10:00:57 +0000
Subject: [PATCH 16/51] fix

---
 src/executor/execution_plan.cc | 27 ++++++++++++++++-----------
 src/include/execution_plan.hpp |  5 +++--
 test/executor_test.cc          |  2 +-
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 0dff4adbd..7753bc0b9 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -29,12 +29,14 @@ auto getOpType = [](const std::string& str) {
     return mscclpp::OperationType::WAIT;
   } else if (str == "flush") {
     return mscclpp::OperationType::FLUSH;
-  } else if (str == "reduce") {
+  } else if (str == "re") {
     return mscclpp::OperationType::REDUCE;
-  } else if (str == "read_reduce_copy") {
-    return mscclpp::OperationType::READ_REDUCE_COPY;
-  } else if (str == "read_reduce_copy_put") {
-    return mscclpp::OperationType::READ_REDUCE_COPY_PUT;
+  } else if (str == "rs") {
+    return mscclpp::OperationType::REDUCE_SEND;
+  } else if (str == "rr") {
+    return mscclpp::OperationType::READ_REDUCE;
+  } else if (str == "rrs") {
+    return mscclpp::OperationType::READ_REDUCE_SEND;
   } else {
     throw std::runtime_error("Invalid operation type");
   }
@@ -157,6 +159,9 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
         channelMap[key].push_back(i);
       }
     }
+    int nthreadblocks = gpu["threadblocks"].size();
+    this->threadblockSMChannelMap[rank].resize(nthreadblocks);
+    this->threadblockProxyChannelMap[rank].resize(nthreadblocks);
     for (const auto& threadblock : gpu["threadblocks"]) {
       for (const auto& channel : threadblock["channels"]) {
         ChannelType channelType = convertToChannelType(channel["ctype"]);
@@ -204,18 +209,18 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
           operation.nOutputChannels = op["o_cids"].size();
         }
         for (int i = 0; i < operation.nInputChannels; i++) {
-          BufferType srcBufferType = convertToBufferType(op["i_buff"][i]["src"]);
-          BufferType dstBufferType = convertToBufferType(op["i_buff"][i]["dst"]);
+          BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
+          BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
           operation.inputChannelIndex[i] =
               channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
-          operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["offset"];
+          operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
         }
         for (int i = 0; i < operation.nOutputChannels; i++) {
-          BufferType srcBufferType = convertToBufferType(op["o_buff"][i]["src"]);
-          BufferType dstBufferType = convertToBufferType(op["o_buff"][i]["dst"]);
+          BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
+          BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
           operation.outputChannelIndex[i] =
               channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
-          operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["offset"];
+          operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["off"];
         }
         if (op.contains("srcbuff")) {
           operation.srcBufferType = convertToBufferType(op["srcbuff"]);
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 25595b093..bfa51b503 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -61,8 +61,9 @@ enum class OperationType : uint8_t {
   WAIT,
   FLUSH,
   REDUCE,
-  READ_REDUCE_COPY,
-  READ_REDUCE_COPY_PUT,
+  REDUCE_SEND,
+  READ_REDUCE,
+  READ_REDUCE_SEND,
 };
 
 struct ChannelInfo {
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 865ba2122..995049ccf 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -18,7 +18,7 @@ int main() {
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
   bootstrap->initialize(id);
   // sleep 10s
-  // std::this_thread::sleep_for(std::chrono::seconds(20));
+  std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
 

From d47ac6581d1beb63667d424dad6e9661e758a6d1 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 3 Apr 2024 14:29:17 +0000
Subject: [PATCH 17/51] try to launch kernel

---
 include/mscclpp/executor.hpp     |  2 +-
 src/CMakeLists.txt               |  2 +-
 src/executor/execution_kernel.cu | 29 ++++--------
 src/executor/executor.cc         | 31 ++++++++-----
 src/include/execution_kernel.hpp | 78 ++++++++++++++++++++++++++++++++
 src/include/execution_plan.hpp   | 63 +-------------------------
 test/executor_test.cc            |  2 +-
 7 files changed, 112 insertions(+), 95 deletions(-)
 create mode 100644 src/include/execution_kernel.hpp

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index 076238336..985ffba3a 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -29,7 +29,7 @@ class Executor {
   Executor& operator=(const Executor&) = delete;
   ~Executor();
 
-  void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+  void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads,
                const ExecutionPlan& plan);
 
  private:
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cfbcc927a..45b4075d2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu)
 target_sources(mscclpp_obj PRIVATE ${SOURCES})
 target_include_directories(mscclpp_obj PRIVATE include)
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index 6b467dbcb..9b22bcc0d 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -1,23 +1,12 @@
-// // Copyright (c) Microsoft Corporation.
-// // Licensed under the MIT license.
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
 
-// #include "execution_plan.hpp"
+#include "execution_kernel.hpp"
 
-// extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[];
+namespace mscclpp {
+__global__ void kernel(DeviceExecutionPlan* plan) {}
 
-// __global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) {
-//   // read data from shared memory
-//   // 1. get the number of command from shared memory
-//   int nOps = sharedMem->nOperations;
-//   mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannel = sharedMem->channels.smChannels;
-//   mscclpp::DeviceHandle<mscclpp::ProxyChannel>* proxyChannel = sharedMem->channels.proxyChannels;
-//   for (int opId = 0; opId < nOps; opId++) {
-//     // 2. get the command
-//     mscclpp::Operation* op = sharedMem->operations + opId;
-//     // 3. execute the command
-//     switch (op->type) {
-//       default:
-//         break;
-//     }
-//   }
-// }
+void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream) {
+  kernel<<<nthreadblocks, nthreads, 0, stream>>>(plan);
+}
+}  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 173870b74..26717d2d4 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -6,6 +6,7 @@
 #include <mscclpp/sm_channel.hpp>
 #include <set>
 
+#include "execution_kernel.hpp"
 #include "execution_plan.hpp"
 
 namespace mscclpp {
@@ -60,9 +61,9 @@ struct ExecutionContext {
   std::vector<mscclpp::SmChannel> smChannels;
   std::vector<mscclpp::SimpleProxyChannel> proxyChannels;
   std::vector<DeviceExecutionPlan> deviceExecutionPlans;
-  std::vector<std::vector<Operation>> operations;
   std::shared_ptr<char> scratchBuffer;
   size_t scratchBufferSize;
+  std::shared_ptr<char> deviceExecutionPlansBuffer;
 };
 
 struct Executor::Impl {
@@ -70,8 +71,10 @@ struct Executor::Impl {
   std::shared_ptr<Communicator> comm;
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
+  CudaStreamWithFlags stream;
 
-  Impl(std::shared_ptr<Communicator> comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) {
+  Impl(std::shared_ptr<Communicator> comm, int nranksPerNode)
+      : nranksPerNode(nranksPerNode), comm(comm), stream(cudaStreamNonBlocking) {
     this->proxyService = std::make_shared<ProxyService>();
   }
   ~Impl() = default;
@@ -93,6 +96,12 @@ struct Executor::Impl {
     this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan);
     this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan);
     this->setupDeviceExecutionPlan(context, rank, plan);
+    context.deviceExecutionPlansBuffer =
+        allocExtSharedCuda<char>(context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan));
+    MSCCLPP_CUDATHROW(cudaMemcpyAsync(context.deviceExecutionPlansBuffer.get(), context.deviceExecutionPlans.data(),
+                                      context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan),
+                                      cudaMemcpyHostToDevice, stream));
+    MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
     return context;
   }
 
@@ -230,7 +239,6 @@ struct Executor::Impl {
     for (int threadblock = 0; threadblock < plan.impl_->getThreadblockCount(rank); threadblock++) {
       DeviceExecutionPlan deviceExecutionPlan;
       std::vector<Operation> ops = plan.impl_->getOperations(rank, threadblock);
-      context.operations.emplace_back(std::move(ops));
       deviceExecutionPlan.nOperations = ops.size();
       deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size();
       deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size();
@@ -240,28 +248,29 @@ struct Executor::Impl {
       for (const auto& [index, key] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) {
         deviceExecutionPlan.channels.proxyChannels[index] = mscclpp::deviceHandle(context.proxyChannels[index]);
       }
+      for (size_t i = 0; i < ops.size(); i++) {
+        deviceExecutionPlan.operations[i] = ops[i];
+      }
       deviceExecutionPlans.push_back(deviceExecutionPlan);
     }
     context.deviceExecutionPlans = std::move(deviceExecutionPlans);
   }
 
-  void launchKernel(ExecutionContext& context) {
-    // copy context to shared memory
-    // std::cout << sizeof(Channels) << std::endl;
-    // std::cout << sizeof(Operation) << std::endl;
-    // std::cout << sizeof(DeviceExecutionPlan) << std::endl;
-    // launch kernel
+  void launchKernel(ExecutionContext& context, int nthreadsPerBlock) {
+    int nthreadblocks = context.deviceExecutionPlans.size();
+    ExecutionKernel::launchKernel(nthreadblocks, nthreadsPerBlock,
+                                  (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), this->stream);
   }
 };
 
 Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
     : impl_(std::make_unique<Impl>(comm, nranksPerNode)) {}
 
-void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads,
                        const ExecutionPlan& plan) {
   ExecutionContext context =
       this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
-  this->impl_->launchKernel(context);
+  this->impl_->launchKernel(context, nthreads);
 }
 
 Executor::~Executor() = default;
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
new file mode 100644
index 000000000..e0c607a7a
--- /dev/null
+++ b/src/include/execution_kernel.hpp
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_EXECUTION_KERNEL_HPP_
+#define MSCCLPP_EXECUTION_KERNEL_HPP_
+
+#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/sm_channel.hpp>
+
+namespace mscclpp {
+
+constexpr int MAX_CHANNEL = 16;
+constexpr int MAX_CHANNEL_PER_OPERATION = 8;
+constexpr int MAX_OPERATION = 64;
+
+enum class BufferType : uint8_t {
+  INPUT,
+  OUTPUT,
+  SCRATCH,
+};
+
+enum class ChannelType : uint8_t {
+  SM,
+  PROXY,
+};
+
+enum class OperationType : uint8_t {
+  BARRIER,
+  PUT,
+  GET,
+  COPY,
+  SIGNAL,
+  WAIT,
+  FLUSH,
+  REDUCE,
+  REDUCE_SEND,
+  READ_REDUCE,
+  READ_REDUCE_SEND,
+};
+
+struct Channels {
+  mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
+  mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
+};
+
+struct Operation {
+  OperationType type;
+  ChannelType channelType;
+  BufferType srcBufferType;
+  BufferType dstBufferType;
+  uint8_t nInputChannels;
+  uint8_t nOutputChannels;
+  uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
+  uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
+  uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION];
+  uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION];
+  uint32_t srcOffset;
+  uint32_t dstOffset;
+  uint32_t size;
+};
+
+// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes
+struct DeviceExecutionPlan {
+  uint8_t nSmChannels;                  // 1 bytes
+  uint8_t nProxyChannels;               // 1 bytes
+  uint16_t nOperations;                 // 2 bytes
+  Channels channels;                    // 1920 bytes
+  Operation operations[MAX_OPERATION];  // 64 * 100 = 6400 bytes
+};
+
+class ExecutionKernel {
+ public:
+  static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream);
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_EXECUTION_KERNEL_HPP_
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index bfa51b503..3575390ba 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -6,24 +6,13 @@
 
 #include <mscclpp/core.hpp>
 #include <mscclpp/executor.hpp>
-#include <mscclpp/proxy_channel.hpp>
-#include <mscclpp/sm_channel.hpp>
 #include <nlohmann/json.hpp>
 #include <string>
 #include <unordered_map>
 
-namespace mscclpp {
-
-enum class BufferType : uint8_t {
-  INPUT,
-  OUTPUT,
-  SCRATCH,
-};
+#include "execution_kernel.hpp"
 
-enum class ChannelType : uint8_t {
-  SM,
-  PROXY,
-};
+namespace mscclpp {
 
 struct ChannelKey {
   BufferType srcBufferType;
@@ -48,24 +37,6 @@ struct hash<mscclpp::ChannelKey> {
 
 namespace mscclpp {
 
-constexpr int MAX_CHANNEL = 16;
-constexpr int MAX_CHANNEL_PER_OPERATION = 8;
-constexpr int MAX_OPERATION = 64;
-
-enum class OperationType : uint8_t {
-  BARRIER,
-  PUT,
-  GET,
-  COPY,
-  SIGNAL,
-  WAIT,
-  FLUSH,
-  REDUCE,
-  REDUCE_SEND,
-  READ_REDUCE,
-  READ_REDUCE_SEND,
-};
-
 struct ChannelInfo {
   BufferType srcBufferType;
   BufferType dstBufferType;
@@ -73,36 +44,6 @@ struct ChannelInfo {
   std::vector<int> connectedPeers;
 };
 
-struct Channels {
-  mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
-  mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
-};
-
-struct Operation {
-  OperationType type;
-  ChannelType channelType;
-  BufferType srcBufferType;
-  BufferType dstBufferType;
-  uint8_t nInputChannels;
-  uint8_t nOutputChannels;
-  uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
-  uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
-  uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION];
-  uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION];
-  uint32_t srcOffset;
-  uint32_t dstOffset;
-  uint32_t size;
-};
-
-// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes
-struct DeviceExecutionPlan {
-  uint8_t nSmChannels;                  // 1 bytes
-  uint8_t nProxyChannels;               // 1 bytes
-  uint16_t nOperations;                 // 2 bytes
-  Channels channels;                    // 1920 bytes
-  Operation operations[MAX_OPERATION];  // 64 * 100 = 6400 bytes
-};
-
 struct ExecutionPlan::Impl {
  public:
   Impl(std::string planPath);
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 995049ccf..8a82f9fc3 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -25,7 +25,7 @@ int main() {
   mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
   std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);
-  executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, plan);
+  executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan);
 
   MPI_Finalize();
   return 0;

From 0b4c19a89ce72546720839c249d997801b30c365 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 3 Apr 2024 14:50:46 +0000
Subject: [PATCH 18/51] WIP: copy execution plan into shared memory in kernel

---
 src/executor/execution_kernel.cu | 26 +++++++++++++++++++++++---
 src/executor/executor.cc         |  4 +++-
 src/include/execution_kernel.hpp |  3 ++-
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index 9b22bcc0d..4bcd4e7e9 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -1,12 +1,32 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include <mscclpp/device.hpp>
+
 #include "execution_kernel.hpp"
 
+#if defined(MSCCLPP_DEVICE_HIP)
+#define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
+#endif  // defined(MSCCLPP_DEVICE_HIP)
+
 namespace mscclpp {
-__global__ void kernel(DeviceExecutionPlan* plan) {}
+__global__ void kernel(DeviceExecutionPlan* plan) {
+  extern __shared__ int sharedMem[];
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+  DeviceExecutionPlan* localPlan = plan + bid;
+  for (int i = tid; i < sizeof(DeviceExecutionPlan); i += blockDim.x) {
+    sharedMem[i] = ((int*)localPlan)[i];
+  }
+#if defined(MSCCLPP_DEVICE_HIP)
+  __synclds();
+#else   // !defined(MSCCLPP_DEVICE_HIP)
+  __syncthreads();
+#endif  // !defined(MSCCLPP_DEVICE_HIP)
+}
 
-void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream) {
-  kernel<<<nthreadblocks, nthreads, 0, stream>>>(plan);
+void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                   cudaStream_t stream) {
+  kernel<<<nthreadblocks, nthreads, sharedMemSize, stream>>>(plan);
 }
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 26717d2d4..507c606d6 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -258,8 +258,10 @@ struct Executor::Impl {
 
   void launchKernel(ExecutionContext& context, int nthreadsPerBlock) {
     int nthreadblocks = context.deviceExecutionPlans.size();
+    size_t sharedMemSize = sizeof(DeviceExecutionPlan);
     ExecutionKernel::launchKernel(nthreadblocks, nthreadsPerBlock,
-                                  (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), this->stream);
+                                  (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize,
+                                  this->stream);
   }
 };
 
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index e0c607a7a..2dfda4011 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -70,7 +70,8 @@ struct DeviceExecutionPlan {
 
 class ExecutionKernel {
  public:
-  static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream);
+  static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                           cudaStream_t stream);
 };
 
 }  // namespace mscclpp

From c14aac266cf3ed993a9016bccd71bc171045a57d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 4 Apr 2024 03:41:26 +0000
Subject: [PATCH 19/51] WIP: handle barrier/signal/wait ops, use caller's stream

---
 include/mscclpp/executor.hpp     |  2 +-
 src/executor/execution_kernel.cu | 69 +++++++++++++++++++++++++++++---
 src/executor/executor.cc         | 19 +++++----
 src/include/execution_kernel.hpp |  2 +-
 test/executor_test.cc            | 19 +++++++--
 5 files changed, 91 insertions(+), 20 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index 985ffba3a..a98853776 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -30,7 +30,7 @@ class Executor {
   ~Executor();
 
   void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads,
-               const ExecutionPlan& plan);
+               const ExecutionPlan& plan, cudaStream_t stream);
 
  private:
   struct Impl;
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index 4bcd4e7e9..47333b62c 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -10,12 +10,39 @@
 #endif  // defined(MSCCLPP_DEVICE_HIP)
 
 namespace mscclpp {
-__global__ void kernel(DeviceExecutionPlan* plan) {
+
+MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle<SmChannel>* smChannels,
+                                        DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
+                                        int nChannels, ChannelType chType) {
+  if (tid < nChannels) {
+    if (chType == ChannelType::SM) {
+      smChannels[channelIndex[tid]].signal();
+    }
+    if (chType == ChannelType::PROXY) {
+      proxyChannels[channelIndex[tid]].signal();
+    }
+  }
+}
+
+MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle<SmChannel>* smChannels,
+                                      DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
+                                      int nChannels, ChannelType chType) {
+  if (tid < nChannels) {
+    if (chType == ChannelType::SM) {
+      smChannels[channelIndex[tid]].wait();
+    }
+    if (chType == ChannelType::PROXY) {
+      proxyChannels[channelIndex[tid]].wait();
+    }
+  }
+}
+
+__global__ void kernel(int rank, DeviceExecutionPlan* plan) {
   extern __shared__ int sharedMem[];
   int bid = blockIdx.x;
   int tid = threadIdx.x;
   DeviceExecutionPlan* localPlan = plan + bid;
-  for (int i = tid; i < sizeof(DeviceExecutionPlan); i += blockDim.x) {
+  for (int i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) {
     sharedMem[i] = ((int*)localPlan)[i];
   }
 #if defined(MSCCLPP_DEVICE_HIP)
@@ -23,10 +50,42 @@ __global__ void kernel(DeviceExecutionPlan* plan) {
 #else   // !defined(MSCCLPP_DEVICE_HIP)
   __syncthreads();
 #endif  // !defined(MSCCLPP_DEVICE_HIP)
+  Operation* operations = localPlan->operations;
+  DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
+  DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
+  if (bid > 0) {
+    return;
+  }
+  for (int i = 0; i < localPlan->nOperations; i++) {
+    switch (operations[i].type) {
+      case OperationType::BARRIER:
+        __syncthreads();
+        break;
+      case OperationType::SIGNAL:
+        // if (tid == 0) {
+        //   printf("rank: %d bid: %d, noutputchannels: %d outputChannelIndex %d\n", rank, bid,
+        //          operations[i].nOutputChannels, operations[i].outputChannelIndex[0]);
+        // }
+        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndex, operations[i].nOutputChannels,
+                     operations[i].channelType);
+        break;
+      case OperationType::WAIT:
+        // if (tid == 0) {
+        //   printf("rank: %d bid: %d, ninputchannels: %d inputChannelIndex %d\n", rank, bid,
+        //   operations[i].nInputChannels,
+        //          operations[i].inputChannelIndex[0]);
+        // }
+        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndex, operations[i].nInputChannels,
+                   operations[i].channelType);
+        break;
+      default:
+        break;
+    }
+  }
 }
 
-void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                   cudaStream_t stream) {
-  kernel<<<nthreadblocks, nthreads, sharedMemSize, stream>>>(plan);
+void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan,
+                                   size_t sharedMemSize, cudaStream_t stream) {
+  kernel<<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, plan);
 }
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 507c606d6..dcdebd7ec 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -71,16 +71,14 @@ struct Executor::Impl {
   std::shared_ptr<Communicator> comm;
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
-  CudaStreamWithFlags stream;
 
-  Impl(std::shared_ptr<Communicator> comm, int nranksPerNode)
-      : nranksPerNode(nranksPerNode), comm(comm), stream(cudaStreamNonBlocking) {
+  Impl(std::shared_ptr<Communicator> comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) {
     this->proxyService = std::make_shared<ProxyService>();
   }
   ~Impl() = default;
 
   ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                                         size_t recvBufferSize, const ExecutionPlan& plan) {
+                                         size_t recvBufferSize, const ExecutionPlan& plan, cudaStream_t stream) {
     ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name};
     if (this->contexts.find(key) != this->contexts.end()) {
       return this->contexts[key];
@@ -102,6 +100,7 @@ struct Executor::Impl {
                                       context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan),
                                       cudaMemcpyHostToDevice, stream));
     MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+    this->contexts.insert({key, context});
     return context;
   }
 
@@ -256,12 +255,12 @@ struct Executor::Impl {
     context.deviceExecutionPlans = std::move(deviceExecutionPlans);
   }
 
-  void launchKernel(ExecutionContext& context, int nthreadsPerBlock) {
+  void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, cudaStream_t stream) {
     int nthreadblocks = context.deviceExecutionPlans.size();
     size_t sharedMemSize = sizeof(DeviceExecutionPlan);
-    ExecutionKernel::launchKernel(nthreadblocks, nthreadsPerBlock,
+    ExecutionKernel::launchKernel(rank, nthreadblocks, nthreadsPerBlock,
                                   (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize,
-                                  this->stream);
+                                  stream);
   }
 };
 
@@ -269,10 +268,10 @@ Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
     : impl_(std::make_unique<Impl>(comm, nranksPerNode)) {}
 
 void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads,
-                       const ExecutionPlan& plan) {
+                       const ExecutionPlan& plan, cudaStream_t stream) {
   ExecutionContext context =
-      this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
-  this->impl_->launchKernel(context, nthreads);
+      this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream);
+  this->impl_->launchKernel(context, rank, nthreads, stream);
 }
 
 Executor::~Executor() = default;
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 2dfda4011..ff79dcc39 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -70,7 +70,7 @@ struct DeviceExecutionPlan {
 
 class ExecutionKernel {
  public:
-  static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize,
+  static void launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize,
                            cudaStream_t stream);
 };
 
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 8a82f9fc3..76a580d36 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -1,6 +1,17 @@
 #include <mpi.h>
 
 #include <mscclpp/executor.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+// Check CUDA RT calls
+#define CUDACHECK(cmd)                                                                  \
+  do {                                                                                  \
+    cudaError_t err = cmd;                                                              \
+    if (err != cudaSuccess) {                                                           \
+      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+      exit(EXIT_FAILURE);                                                               \
+    }                                                                                   \
+  } while (false)
 
 const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp";
 
@@ -17,15 +28,17 @@ int main() {
   }
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
   bootstrap->initialize(id);
-  // sleep 10s
-  std::this_thread::sleep_for(std::chrono::seconds(20));
+  // sleep 20s
+  // std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
 
   mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
   std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);
-  executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan);
+  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan, stream);
+  CUDACHECK(cudaStreamSynchronize(stream));
 
   MPI_Finalize();
   return 0;

From d1c28bb3642b145d29456a787bdec492bce1c6d2 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 4 Apr 2024 07:51:20 +0000
Subject: [PATCH 20/51] pass build

---
 include/mscclpp/executor.hpp     |  11 +-
 src/executor/execution_kernel.cu | 192 ++++++++++++++++++++++++++++++-
 src/executor/executor.cc         |  15 +--
 src/include/execution_kernel.hpp |   7 +-
 test/executor_test.cc            |   6 +-
 5 files changed, 213 insertions(+), 18 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index a98853776..21087a762 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -10,6 +10,13 @@
 
 namespace mscclpp {
 
+enum class DataType {
+  INT32,
+  UINT32,
+  FLOAT16,
+  FLOAT32,
+};
+
 class ExecutionPlan {
  public:
   ExecutionPlan(std::string planPath);
@@ -29,8 +36,8 @@ class Executor {
   Executor& operator=(const Executor&) = delete;
   ~Executor();
 
-  void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads,
-               const ExecutionPlan& plan, cudaStream_t stream);
+  void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType,
+               int nthreads, const ExecutionPlan& plan, cudaStream_t stream);
 
  private:
   struct Impl;
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index 47333b62c..3d2304735 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -9,6 +9,121 @@
 #define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
 #endif  // defined(MSCCLPP_DEVICE_HIP)
 
+namespace {
+template <typename To, typename From>
+MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) {
+  static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");
+
+  union {
+    From f;
+    To t;
+  } u;
+  u.f = src;
+  return u.t;
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) {
+  return a + b;
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) {
+  return __hadd2(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) {
+  int4 ret;
+  ret.w = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
+  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  ret.z = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
+  return ret;
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) {
+  uint2 ret;
+  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  return ret;
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE uint2 add_vectors<__half>(uint2 a, uint2 b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) {
+  return bit_cast<int, T>(add_elements(bit_cast<T, int>(a), bit_cast<T, int>(b)));
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE int add_vectors<__half>(int a, int b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) {
+  return bit_cast<uint32_t, T>(add_elements(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) {
+  size_t nInt4 = nElem / 4;
+  size_t nLastInts = nElem % 4;
+  int4* dst4 = (int4*)dst;
+  int4* src4 = (int4*)src;
+  for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) {
+    dst4[i] = add_vectors<T>(dst4[i], src4[i]);
+  }
+  if (nLastInts > 0) {
+    int* dstLast = ((int*)dst) + nInt4 * 4;
+    int* srcLast = ((int*)src) + nInt4 * 4;
+    for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) {
+      dstLast[i] = add_vectors<T>(dstLast[i], srcLast[i]);
+    }
+  }
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem) {
+  vectorSum(dst, src, nElem, blockIdx.x, gridDim.x);
+}
+}  // namespace
+
 namespace mscclpp {
 
 MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle<SmChannel>* smChannels,
@@ -37,7 +152,52 @@ MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle<SmChannel>* smChanne
   }
 }
 
-__global__ void kernel(int rank, DeviceExecutionPlan* plan) {
+template <typename T>
+MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output,
+                                                    uint32_t outputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
+                                                    uint8_t* srcChannelIndex, uint8_t* dstChannelIndex,
+                                                    uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels,
+                                                    int nDstChannels, uint32_t size) {
+  const size_t vectorSize = sizeof(int4) / sizeof(T);
+  const size_t nInt4 = size / sizeof(int4);
+  const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4);
+  const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4);
+  int4* input4 = (int4*)input;
+  int4* output4 = (int4*)output;
+  for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) {
+    int4 tmp = input4[inputOffset4 + idx];
+    for (int index = 0; index < nSrcChannels; ++index) {
+      int4 val;
+      size_t srcOffset = srcOffsets[index] / sizeof(int4);
+      val = smChannels[srcChannelIndex[index]].read<int4>(srcOffset + idx);
+      tmp = add_vectors<T>(tmp, val);
+    }
+    output4[outputOffset4 + idx] = tmp;
+    for (int index = 0; index < nDstChannels; ++index) {
+      size_t dstOffset = dstOffsets[index] / sizeof(int4);
+      smChannels[dstChannelIndex[index]].write<int4>(dstOffset + idx, tmp);
+    }
+  }
+  // handle rest of data
+  size_t processed = nInt4 * sizeof(int4);
+  const size_t startIdx = (inputOffsetByBytes + processed) / sizeof(T);
+  const size_t endIdx = (inputOffsetByBytes + size) / sizeof(T);
+  for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) {
+    T tmp = input[idx];
+    for (int index = 0; index < nSrcChannels; ++index) {
+      size_t srcOffset = srcOffsets[index] / sizeof(T);
+      tmp += smChannels[srcChannelIndex[index]].read<T>(srcOffset + idx);
+    }
+    output[idx] = tmp;
+    for (int index = 0; index < nDstChannels; ++index) {
+      size_t dstOffset = dstOffsets[index] / sizeof(T);
+      smChannels[dstChannelIndex[index]].write<T>(dstOffset + idx, tmp);
+    }
+  }
+}
+
+template <typename T>
+__global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutionPlan* plan) {
   extern __shared__ int sharedMem[];
   int bid = blockIdx.x;
   int tid = threadIdx.x;
@@ -78,14 +238,38 @@ __global__ void kernel(int rank, DeviceExecutionPlan* plan) {
         handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndex, operations[i].nInputChannels,
                    operations[i].channelType);
         break;
+      case OperationType::READ_REDUCE_SEND:
+        handleReadReduceCopySend(input, operations[i].srcOffset, input, operations[i].srcOffset, smChannels,
+                                 operations[i].inputChannelIndex, operations[i].outputChannelIndex,
+                                 operations[i].inputOffset, operations[i].outputOffset, operations[i].nInputChannels,
+                                 operations[i].nOutputChannels, operations[i].size);
+        break;
       default:
         break;
     }
   }
 }
 
-void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan,
-                                   size_t sharedMemSize, cudaStream_t stream) {
-  kernel<<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, plan);
+void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                   cudaStream_t stream) {
+  switch (dataType) {
+    case DataType::INT32:
+      kernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
+                                                                          (int32_t*)scratch, plan);
+      break;
+    case DataType::UINT32:
+      kernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (uint32_t*)src, (uint32_t*)dst,
+                                                                           (uint32_t*)scratch, plan);
+      break;
+    case DataType::FLOAT16:
+      kernel<half>
+          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
+      break;
+    case DataType::FLOAT32:
+      kernel<float>
+          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
+      break;
+  }
 }
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index dcdebd7ec..38cce1cb4 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -255,23 +255,24 @@ struct Executor::Impl {
     context.deviceExecutionPlans = std::move(deviceExecutionPlans);
   }
 
-  void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, cudaStream_t stream) {
+  void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, void* sendbuff, void* recvbuff,
+                    DataType dataType, cudaStream_t stream) {
     int nthreadblocks = context.deviceExecutionPlans.size();
     size_t sharedMemSize = sizeof(DeviceExecutionPlan);
-    ExecutionKernel::launchKernel(rank, nthreadblocks, nthreadsPerBlock,
-                                  (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize,
-                                  stream);
+    ExecutionKernel::launchKernel(
+        rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType,
+        (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream);
   }
 };
 
 Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
     : impl_(std::make_unique<Impl>(comm, nranksPerNode)) {}
 
-void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads,
-                       const ExecutionPlan& plan, cudaStream_t stream) {
+void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+                       DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream) {
   ExecutionContext context =
       this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream);
-  this->impl_->launchKernel(context, rank, nthreads, stream);
+  this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream);
 }
 
 Executor::~Executor() = default;
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index ff79dcc39..0d86f8231 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -4,6 +4,7 @@
 #ifndef MSCCLPP_EXECUTION_KERNEL_HPP_
 #define MSCCLPP_EXECUTION_KERNEL_HPP_
 
+#include <mscclpp/executor.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
 
@@ -35,7 +36,9 @@ enum class OperationType : uint8_t {
   REDUCE,
   REDUCE_SEND,
   READ_REDUCE,
+  READ_REDUCE_COPY,
   READ_REDUCE_SEND,
+  READ_REDUCE_COPY_SEND,
 };
 
 struct Channels {
@@ -70,8 +73,8 @@ struct DeviceExecutionPlan {
 
 class ExecutionKernel {
  public:
-  static void launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize,
-                           cudaStream_t stream);
+  static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream);
 };
 
 }  // namespace mscclpp
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 76a580d36..df81dbaad 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -34,10 +34,10 @@ int main() {
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
 
   mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
-  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024);
-  std::shared_ptr<char> recvbuff = mscclpp::allocExtSharedCuda<char>(1024);
+  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024 * 1024);
   mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
-  executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan, stream);
+  executor->execute(rank, sendbuff.get(), sendbuff.get(), 1024 * 1024, 1024 * 1024, mscclpp::DataType::FLOAT16, 512,
+                    plan, stream);
   CUDACHECK(cudaStreamSynchronize(stream));
 
   MPI_Finalize();

From 36d31db9de8cf25169f024598f83c402913951b3 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 4 Apr 2024 10:10:14 +0000
Subject: [PATCH 21/51] Fix channel indexing bugs; rename rrs op to rrcs

---
 src/executor/execution_kernel.cu    | 34 ++++++++++++++++++++---------
 src/executor/execution_plan.cc      | 24 ++++++++++----------
 src/executor/executor.cc            | 16 ++++++++------
 src/include/execution_kernel.hpp    | 10 ++++-----
 test/execution-files/allreduce.json | 20 +++++++++++++----
 5 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index 3d2304735..b781c71a9 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -126,6 +126,20 @@ MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem) {
 
 namespace mscclpp {
 
+template <typename T>
+MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) {
+  if (bufferType == BufferType::INPUT) {
+    return input;
+  }
+  if (bufferType == BufferType::OUTPUT) {
+    return output;
+  }
+  if (bufferType == BufferType::SCRATCH) {
+    return scratch;
+  }
+  return nullptr;
+}
+
 MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle<SmChannel>* smChannels,
                                         DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
                                         int nChannels, ChannelType chType) {
@@ -158,7 +172,6 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs
                                                     uint8_t* srcChannelIndex, uint8_t* dstChannelIndex,
                                                     uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels,
                                                     int nDstChannels, uint32_t size) {
-  const size_t vectorSize = sizeof(int4) / sizeof(T);
   const size_t nInt4 = size / sizeof(int4);
   const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4);
   const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4);
@@ -213,9 +226,8 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio
   Operation* operations = localPlan->operations;
   DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
   DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
-  if (bid > 0) {
-    return;
-  }
+  T* src = nullptr;
+  T* dst = nullptr;
   for (int i = 0; i < localPlan->nOperations; i++) {
     switch (operations[i].type) {
       case OperationType::BARRIER:
@@ -226,7 +238,7 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio
         //   printf("rank: %d bid: %d, noutputchannels: %d outputChannelIndex %d\n", rank, bid,
         //          operations[i].nOutputChannels, operations[i].outputChannelIndex[0]);
         // }
-        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndex, operations[i].nOutputChannels,
+        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels,
                      operations[i].channelType);
         break;
       case OperationType::WAIT:
@@ -235,13 +247,15 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio
         //   operations[i].nInputChannels,
         //          operations[i].inputChannelIndex[0]);
         // }
-        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndex, operations[i].nInputChannels,
+        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels,
                    operations[i].channelType);
         break;
-      case OperationType::READ_REDUCE_SEND:
-        handleReadReduceCopySend(input, operations[i].srcOffset, input, operations[i].srcOffset, smChannels,
-                                 operations[i].inputChannelIndex, operations[i].outputChannelIndex,
-                                 operations[i].inputOffset, operations[i].outputOffset, operations[i].nInputChannels,
+      case OperationType::READ_REDUCE_COPY_SEND:
+        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
+        handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels,
+                                 operations[i].inputChannelIndexes, operations[i].outputChannelIndexes,
+                                 operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
                                  operations[i].nOutputChannels, operations[i].size);
         break;
       default:
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 7753bc0b9..4a64e86e6 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -33,10 +33,10 @@ auto getOpType = [](const std::string& str) {
     return mscclpp::OperationType::REDUCE;
   } else if (str == "rs") {
     return mscclpp::OperationType::REDUCE_SEND;
-  } else if (str == "rr") {
-    return mscclpp::OperationType::READ_REDUCE;
-  } else if (str == "rrs") {
-    return mscclpp::OperationType::READ_REDUCE_SEND;
+  } else if (str == "rrc") {
+    return mscclpp::OperationType::READ_REDUCE_COPY;
+  } else if (str == "rrcs") {
+    return mscclpp::OperationType::READ_REDUCE_COPY_SEND;
   } else {
     throw std::runtime_error("Invalid operation type");
   }
@@ -153,10 +153,12 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
     std::unordered_map<ChannelKey, std::vector<int>> channelMap;
     for (auto channelType : channelTypes) {
       const std::vector<ChannelInfo> channelInfos = this->getChannelInfos(rank, channelType);
-      for (size_t i = 0; i < channelInfos.size(); i++) {
-        const ChannelInfo& info = channelInfos[i];
+      int index = 0;
+      for (const auto& info : channelInfos) {
         ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType};
-        channelMap[key].push_back(i);
+        for (size_t i = 0; i < info.connectedPeers.size(); i++) {
+          channelMap[key].push_back(index++);
+        }
       }
     }
     int nthreadblocks = gpu["threadblocks"].size();
@@ -211,16 +213,16 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
         for (int i = 0; i < operation.nInputChannels; i++) {
           BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
           BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
-          operation.inputChannelIndex[i] =
+          operation.inputChannelIndexes[i] =
               channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
-          operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
+          operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
         }
         for (int i = 0; i < operation.nOutputChannels; i++) {
           BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
           BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
-          operation.outputChannelIndex[i] =
+          operation.outputChannelIndexes[i] =
               channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
-          operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["off"];
+          operation.outputOffsets[i] = this->chunkSize * (int)op["o_cids"][i]["off"];
         }
         if (op.contains("srcbuff")) {
           operation.srcBufferType = convertToBufferType(op["srcbuff"]);
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 38cce1cb4..1d6d9305e 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -220,11 +220,11 @@ struct Executor::Impl {
         RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport);
         for (int peer : info.connectedPeers) {
           if (channelType == ChannelType::SM) {
-            context.smChannels.emplace_back(context.smSemaphores[index],
+            context.smChannels.emplace_back(context.smSemaphores[index++],
                                             context.registeredMemories[{info.dstBufferType, peer}], src, nullptr);
           } else if (channelType == ChannelType::PROXY) {
             context.proxyChannels.emplace_back(
-                this->proxyService->proxyChannel(context.proxySemaphores[index]),
+                this->proxyService->proxyChannel(context.proxySemaphores[index++]),
                 this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]),
                 this->proxyService->addMemory(localMemory));
           }
@@ -236,16 +236,18 @@ struct Executor::Impl {
   void setupDeviceExecutionPlan(ExecutionContext& context, int rank, const ExecutionPlan& plan) {
     std::vector<DeviceExecutionPlan> deviceExecutionPlans;
     for (int threadblock = 0; threadblock < plan.impl_->getThreadblockCount(rank); threadblock++) {
-      DeviceExecutionPlan deviceExecutionPlan;
+      DeviceExecutionPlan deviceExecutionPlan = {};
       std::vector<Operation> ops = plan.impl_->getOperations(rank, threadblock);
       deviceExecutionPlan.nOperations = ops.size();
       deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size();
       deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size();
-      for (const auto& [index, key] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) {
-        deviceExecutionPlan.channels.smChannels[index] = mscclpp::deviceHandle(context.smChannels[index]);
+      int chanIndex = 0;
+      for (const auto& [index, _] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) {
+        deviceExecutionPlan.channels.smChannels[chanIndex++] = mscclpp::deviceHandle(context.smChannels[index]);
       }
-      for (const auto& [index, key] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) {
-        deviceExecutionPlan.channels.proxyChannels[index] = mscclpp::deviceHandle(context.proxyChannels[index]);
+      chanIndex = 0;
+      for (const auto& [index, _] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) {
+        deviceExecutionPlan.channels.proxyChannels[chanIndex++] = mscclpp::deviceHandle(context.proxyChannels[index]);
       }
       for (size_t i = 0; i < ops.size(); i++) {
         deviceExecutionPlan.operations[i] = ops[i];
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 0d86f8231..f1934b567 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -35,9 +35,7 @@ enum class OperationType : uint8_t {
   FLUSH,
   REDUCE,
   REDUCE_SEND,
-  READ_REDUCE,
   READ_REDUCE_COPY,
-  READ_REDUCE_SEND,
   READ_REDUCE_COPY_SEND,
 };
 
@@ -53,10 +51,10 @@ struct Operation {
   BufferType dstBufferType;
   uint8_t nInputChannels;
   uint8_t nOutputChannels;
-  uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION];
-  uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION];
-  uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION];
-  uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION];
+  uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
+  uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
+  uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
+  uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
   uint32_t srcOffset;
   uint32_t dstOffset;
   uint32_t size;
diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json
index b2032e90b..67e393fac 100644
--- a/test/execution-files/allreduce.json
+++ b/test/execution-files/allreduce.json
@@ -53,7 +53,7 @@
               ]
             },
             {
-              "name": "rrs",
+              "name": "rrcs",
               "i_buff": {
                 "src": "i",
                 "dst": "i"
@@ -74,6 +74,9 @@
                   "off": 0
                 }
               ],
+              "src": 0,
+              "srcbuff": "i",
+              "srcoff": 0,
               "dst": 0,
               "dstbuff": "i",
               "dstoff": 0,
@@ -174,7 +177,7 @@
               ]
             },
             {
-              "name": "rrs",
+              "name": "rrcs",
               "i_buff": {
                 "src": "i",
                 "dst": "i"
@@ -195,6 +198,9 @@
                   "off": 0
                 }
               ],
+              "src": 0,
+              "srcbuff": "i",
+              "srcoff": 1,
               "dst": 0,
               "dstbuff": "i",
               "dstoff": 1,
@@ -314,7 +320,7 @@
               ]
             },
             {
-              "name": "rrs",
+              "name": "rrcs",
               "i_buff": {
                 "src": "i",
                 "dst": "i"
@@ -335,6 +341,9 @@
                   "off": 2
                 }
               ],
+              "src": 1,
+              "srcbuff": "i",
+              "srcoff": 2,
               "dst": 1,
               "dstbuff": "i",
               "dstoff": 2,
@@ -435,7 +444,7 @@
               ]
             },
             {
-              "name": "rrs",
+              "name": "rrcs",
               "i_buff": {
                 "src": "i",
                 "dst": "i"
@@ -456,6 +465,9 @@
                   "off": 2
                 }
               ],
+              "src": 1,
+              "srcbuff": "i",
+              "srcoff": 3,
               "dst": 1,
               "dstbuff": "i",
               "dstoff": 3,

From 37c2d7da623df6adca3c9e0863a2ae8df6994213 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 4 Apr 2024 10:37:55 +0000
Subject: [PATCH 22/51] Minor cleanup of kernel helpers and test plan offsets

---
 src/executor/execution_kernel.cu    | 37 +++--------------------------
 test/execution-files/allreduce.json | 12 +++++-----
 2 files changed, 9 insertions(+), 40 deletions(-)

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index b781c71a9..8a07870de 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -66,7 +66,7 @@ MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) {
 }
 
 template <>
-MSCCLPP_DEVICE_INLINE uint2 add_vectors<__half>(uint2 a, uint2 b) {
+MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) {
   return add_vectors_helper<__half2>(a, b);
 }
 
@@ -81,7 +81,7 @@ MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) {
 }
 
 template <>
-MSCCLPP_DEVICE_INLINE int add_vectors<__half>(int a, int b) {
+MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) {
   return add_vectors_helper<__half2>(a, b);
 }
 
@@ -96,32 +96,10 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) {
 }
 
 template <>
-MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
+MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
   return add_vectors_helper<__half2>(a, b);
 }
 
-template <typename T>
-MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) {
-  size_t nInt4 = nElem / 4;
-  size_t nLastInts = nElem % 4;
-  int4* dst4 = (int4*)dst;
-  int4* src4 = (int4*)src;
-  for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) {
-    dst4[i] = add_vectors<T>(dst4[i], src4[i]);
-  }
-  if (nLastInts > 0) {
-    int* dstLast = ((int*)dst) + nInt4 * 4;
-    int* srcLast = ((int*)src) + nInt4 * 4;
-    for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) {
-      dstLast[i] = add_vectors<T>(dstLast[i], srcLast[i]);
-    }
-  }
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem) {
-  vectorSum(dst, src, nElem, blockIdx.x, gridDim.x);
-}
 }  // namespace
 
 namespace mscclpp {
@@ -234,19 +212,10 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio
         __syncthreads();
         break;
       case OperationType::SIGNAL:
-        // if (tid == 0) {
-        //   printf("rank: %d bid: %d, noutputchannels: %d outputChannelIndex %d\n", rank, bid,
-        //          operations[i].nOutputChannels, operations[i].outputChannelIndex[0]);
-        // }
         handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels,
                      operations[i].channelType);
         break;
       case OperationType::WAIT:
-        // if (tid == 0) {
-        //   printf("rank: %d bid: %d, ninputchannels: %d inputChannelIndex %d\n", rank, bid,
-        //   operations[i].nInputChannels,
-        //          operations[i].inputChannelIndex[0]);
-        // }
         handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels,
                    operations[i].channelType);
         break;
diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json
index 67e393fac..a511316ab 100644
--- a/test/execution-files/allreduce.json
+++ b/test/execution-files/allreduce.json
@@ -195,7 +195,7 @@
               "o_cids": [
                 {
                   "id": 0,
-                  "off": 0
+                  "off": 1
                 }
               ],
               "src": 0,
@@ -225,7 +225,7 @@
               "o_cids": [
                 {
                   "id": 0,
-                  "off": 0
+                  "off": 1
                 }
               ],
               "ctype": "sm",
@@ -240,7 +240,7 @@
               "i_cids": [
                 {
                   "id": 0,
-                  "off": 2
+                  "off": 3
                 }
               ],
               "ctype": "sm",
@@ -462,7 +462,7 @@
               "o_cids": [
                 {
                   "id": 0,
-                  "off": 2
+                  "off": 3
                 }
               ],
               "src": 1,
@@ -492,7 +492,7 @@
               "o_cids": [
                 {
                   "id": 0,
-                  "off": 2
+                  "off": 3
                 }
               ],
               "ctype": "sm",
@@ -507,7 +507,7 @@
               "i_cids": [
                 {
                   "id": 0,
-                  "off": 0
+                  "off": 1
                 }
               ],
               "ctype": "sm",

From 8c7978016a4c5db827a90941c2ebdbdfe18e0dab Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 4 Apr 2024 15:09:23 +0000
Subject: [PATCH 23/51] Add Python bindings for Executor and ExecutionPlan

---
 python/mscclpp/__init__.py   |  3 +++
 python/mscclpp/core_py.cpp   |  2 ++
 python/mscclpp/executor.cpp  | 34 +++++++++++++++++++++++++
 python/test/executor_test.py | 49 ++++++++++++++++++++++++++++++++++++
 test/executor_test.cc        |  3 ++-
 5 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 python/mscclpp/executor.cpp
 create mode 100644 python/test/executor_test.py

diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
index 8f013e080..0c8f7eb3b 100644
--- a/python/mscclpp/__init__.py
+++ b/python/mscclpp/__init__.py
@@ -19,6 +19,9 @@
     TcpBootstrap,
     Transport,
     TransportFlags,
+    DataType,
+    Executor,
+    ExecutionPlan,
     version,
     is_nvls_supported,
 )
diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp
index 1a1cd2780..3f78dad35 100644
--- a/python/mscclpp/core_py.cpp
+++ b/python/mscclpp/core_py.cpp
@@ -20,6 +20,7 @@ extern void register_fifo(nb::module_& m);
 extern void register_semaphore(nb::module_& m);
 extern void register_utils(nb::module_& m);
 extern void register_numa(nb::module_& m);
+extern void register_executor(nb::module_& m);
 
 template <typename T>
 void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
@@ -204,4 +205,5 @@ NB_MODULE(_mscclpp, m) {
   register_utils(m);
   register_core(m);
   register_numa(m);
+  register_executor(m);
 }
diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp
new file mode 100644
index 000000000..5276e3336
--- /dev/null
+++ b/python/mscclpp/executor.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+
+#include <mscclpp/executor.hpp>
+#include <mscclpp/gpu.hpp>
+
+namespace nb = nanobind;
+using namespace mscclpp;
+
+void register_executor(nb::module_& m) {
+  nb::enum_<DataType>(m, "DataType")
+      .value("int32", DataType::INT32)
+      .value("uint32", DataType::UINT32)
+      .value("float16", DataType::FLOAT16)
+      .value("float32", DataType::FLOAT32);
+
+  nb::class_<ExecutionPlan>(m, "ExecutionPlan").def(nb::init<std::string>(), nb::arg("planPath"));
+
+  nb::class_<Executor>(m, "Executor")
+      .def(nb::init<std::shared_ptr<Communicator>, int>(), nb::arg("comm"), nb::arg("nranksPerNode"))
+      .def(
+          "execute",
+          [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+             DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream) {
+            self->execute(rank, reinterpret_cast<void*>(sendbuff), reinterpret_cast<void*>(recvBuff), sendBuffSize,
+                          recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream);
+          },
+          nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"),
+          nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"));
+}
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
new file mode 100644
index 000000000..b896bf102
--- /dev/null
+++ b/python/test/executor_test.py
@@ -0,0 +1,49 @@
+from os import path
+from mscclpp import (
+    DataType,
+    Executor,
+    ExecutionPlan,
+)
+import mscclpp.comm as mscclpp_comm
+
+import cupy as cp
+from mpi4py import MPI
+
+MSCCLPP_ROOT_PATH = "/root/mscclpp"
+
+if __name__ == "__main__":
+    shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL)
+    N_GPUS_PER_NODE = shm_comm.size
+    shm_comm.Free()
+
+    cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use()
+    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
+    executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
+    execution_plan = ExecutionPlan(path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json"))
+
+    buffer_size = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(buffer_size).astype(cp.float16)
+    sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size)
+    sendbuf = sub_arrays[MPI.COMM_WORLD.rank]
+
+    expected = cp.zeros_like(sendbuf)
+    for i in range(MPI.COMM_WORLD.size):
+        expected += sub_arrays[i]
+
+    stream = cp.cuda.Stream(non_blocking=True)
+    executor.execute(
+        MPI.COMM_WORLD.rank,
+        sendbuf.data.ptr,
+        sendbuf.data.ptr,
+        buffer_size,
+        buffer_size,
+        DataType.float16,
+        512,
+        execution_plan,
+        stream.ptr,
+    )
+    stream.synchronize()
+    assert cp.allclose(sendbuf, expected, atol=1e-3)
+    executor = None
+    mscclpp_group = None
diff --git a/test/executor_test.cc b/test/executor_test.cc
index df81dbaad..c58573ce8 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -31,8 +31,9 @@ int main() {
   // sleep 20s
   // std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
-  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
+  CUDACHECK(cudaSetDevice(rank));
 
+  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
   mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024 * 1024);
   mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);

From 71b62246bf52c213a39b3de922fa6ad5e2e47365 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 03:21:23 +0000
Subject: [PATCH 24/51] Fix executor test buffer sizes and tolerance

---
 python/test/executor_test.py | 10 +++++-----
 test/executor_test.cc        |  7 ++++---
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index b896bf102..a777c3546 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -21,9 +21,9 @@
     executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
     execution_plan = ExecutionPlan(path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json"))
 
-    buffer_size = 1024 * 1024
+    nelems = 1024 * 1024
     cp.random.seed(42)
-    buffer = cp.random.random(buffer_size).astype(cp.float16)
+    buffer = cp.random.random(nelems).astype(cp.float16)
     sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size)
     sendbuf = sub_arrays[MPI.COMM_WORLD.rank]
 
@@ -36,14 +36,14 @@
         MPI.COMM_WORLD.rank,
         sendbuf.data.ptr,
         sendbuf.data.ptr,
-        buffer_size,
-        buffer_size,
+        sendbuf.nbytes,
+        sendbuf.nbytes,
         DataType.float16,
         512,
         execution_plan,
         stream.ptr,
     )
     stream.synchronize()
-    assert cp.allclose(sendbuf, expected, atol=1e-3)
+    assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size)
     executor = None
     mscclpp_group = None
diff --git a/test/executor_test.cc b/test/executor_test.cc
index c58573ce8..4a7b36a79 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -35,10 +35,11 @@ int main() {
 
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
   mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
-  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(1024 * 1024);
+  const int bufferSize = 1024 * 1024;
+  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
   mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
-  executor->execute(rank, sendbuff.get(), sendbuff.get(), 1024 * 1024, 1024 * 1024, mscclpp::DataType::FLOAT16, 512,
-                    plan, stream);
+  executor->execute(rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, 512, plan,
+                    stream);
   CUDACHECK(cudaStreamSynchronize(stream));
 
   MPI_Finalize();

From 2eb6426f79cb3e665af869b9cae170bf43bedfc9 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 03:40:13 +0000
Subject: [PATCH 25/51] Update allreduce test JSON to 8 input chunks and more threadblocks

---
 test/execution-files/allreduce.json | 516 +++++++++++++++++++++++++++-
 1 file changed, 508 insertions(+), 8 deletions(-)

diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json
index a511316ab..739b8e6ab 100644
--- a/test/execution-files/allreduce.json
+++ b/test/execution-files/allreduce.json
@@ -6,7 +6,7 @@
   "gpus": [
     {
       "id": 0,
-      "inputChunks": 4,
+      "inputChunks": 8,
       "outputChunks": 0,
       "scratchChunks": 0,
       "threadblocks": [
@@ -136,6 +136,130 @@
         },
         {
           "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "src": 0,
+              "srcbuff": "i",
+              "srcoff": 4,
+              "dst": 0,
+              "dstbuff": "i",
+              "dstoff": 4,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cids": [
+                1
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
           "ops": [
             {
               "name": "signal",
@@ -171,7 +295,7 @@
               "name": "nop",
               "deps": [
                 {
-                  "tb": 1,
+                  "tb": 2,
                   "step": 1
                 }
               ]
@@ -211,7 +335,7 @@
               "name": "nop",
               "deps": [
                 {
-                  "tb": 1,
+                  "tb": 2,
                   "step": 3
                 }
               ]
@@ -253,7 +377,131 @@
               "dst": "i",
               "ctype": "sm",
               "cids": [
-                1
+                2
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 3,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "src": 0,
+              "srcbuff": "i",
+              "srcoff": 5,
+              "dst": 0,
+              "dstbuff": "i",
+              "dstoff": 5,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 3,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cids": [
+                3
               ]
             }
           ]
@@ -265,6 +513,8 @@
           "dstbuff": "i",
           "type": "sm",
           "connectedTo": [
+            1,
+            1,
             1,
             1
           ]
@@ -273,7 +523,7 @@
     },
     {
       "id": 1,
-      "inputChunks": 4,
+      "inputChunks": 8,
       "outputChunks": 0,
       "scratchChunks": 0,
       "threadblocks": [
@@ -403,6 +653,130 @@
         },
         {
           "id": 1,
+          "ops": [
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "src": 1,
+              "srcbuff": "i",
+              "srcoff": 6,
+              "dst": 1,
+              "dstbuff": "i",
+              "dstoff": 6,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 1,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cids": [
+                1
+              ]
+            }
+          ]
+        },
+        {
+          "id": 2,
           "ops": [
             {
               "name": "signal",
@@ -438,7 +812,7 @@
               "name": "nop",
               "deps": [
                 {
-                  "tb": 1,
+                  "tb": 2,
                   "step": 1
                 }
               ]
@@ -478,7 +852,7 @@
               "name": "nop",
               "deps": [
                 {
-                  "tb": 1,
+                  "tb": 2,
                   "step": 3
                 }
               ]
@@ -520,7 +894,131 @@
               "dst": "i",
               "ctype": "sm",
               "cids": [
-                1
+                2
+              ]
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "ops": [
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 3,
+                  "step": 1
+                }
+              ]
+            },
+            {
+              "name": "rrcs",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "src": 1,
+              "srcbuff": "i",
+              "srcoff": 7,
+              "dst": 1,
+              "dstbuff": "i",
+              "dstoff": 7,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "nop",
+              "deps": [
+                {
+                  "tb": 3,
+                  "step": 3
+                }
+              ]
+            },
+            {
+              "name": "signal",
+              "o_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "wait",
+              "i_buff": {
+                "src": "i",
+                "dst": "i"
+              },
+              "i_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "ctype": "sm",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "i",
+              "ctype": "sm",
+              "cids": [
+                3
               ]
             }
           ]
@@ -532,6 +1030,8 @@
           "dstbuff": "i",
           "type": "sm",
           "connectedTo": [
+            0,
+            0,
             0,
             0
           ]

From 7e74ed8522c84e3c32166f83a198779e9c6c0abe Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 06:34:38 +0000
Subject: [PATCH 26/51] Move execution kernel from .cu into headers for ROCm

---
 src/CMakeLists.txt               |   2 +-
 src/executor/execution_kernel.cu | 258 --------------------------
 src/include/execution_common.hpp |  73 ++++++++
 src/include/execution_kernel.hpp | 298 +++++++++++++++++++++++++------
 src/include/execution_plan.hpp   |   2 +-
 5 files changed, 318 insertions(+), 315 deletions(-)
 delete mode 100644 src/executor/execution_kernel.cu
 create mode 100644 src/include/execution_common.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 45b4075d2..cfbcc927a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu)
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
 target_sources(mscclpp_obj PRIVATE ${SOURCES})
 target_include_directories(mscclpp_obj PRIVATE include)
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
deleted file mode 100644
index 8a07870de..000000000
--- a/src/executor/execution_kernel.cu
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <mscclpp/device.hpp>
-
-#include "execution_kernel.hpp"
-
-#if defined(MSCCLPP_DEVICE_HIP)
-#define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
-#endif  // defined(MSCCLPP_DEVICE_HIP)
-
-namespace {
-template <typename To, typename From>
-MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) {
-  static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");
-
-  union {
-    From f;
-    To t;
-  } u;
-  u.f = src;
-  return u.t;
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) {
-  return a + b;
-}
-
-template <>
-MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) {
-  return __hadd2(a, b);
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) {
-  int4 ret;
-  ret.w = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
-  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
-  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
-  ret.z = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
-  return ret;
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) {
-  uint2 ret;
-  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
-  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
-  return ret;
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) {
-  return bit_cast<int, T>(add_elements(bit_cast<T, int>(a), bit_cast<T, int>(b)));
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) {
-  return bit_cast<uint32_t, T>(add_elements(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-}  // namespace
-
-namespace mscclpp {
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) {
-  if (bufferType == BufferType::INPUT) {
-    return input;
-  }
-  if (bufferType == BufferType::OUTPUT) {
-    return output;
-  }
-  if (bufferType == BufferType::SCRATCH) {
-    return scratch;
-  }
-  return nullptr;
-}
-
-MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle<SmChannel>* smChannels,
-                                        DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
-                                        int nChannels, ChannelType chType) {
-  if (tid < nChannels) {
-    if (chType == ChannelType::SM) {
-      smChannels[channelIndex[tid]].signal();
-    }
-    if (chType == ChannelType::PROXY) {
-      proxyChannels[channelIndex[tid]].signal();
-    }
-  }
-}
-
-MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle<SmChannel>* smChannels,
-                                      DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
-                                      int nChannels, ChannelType chType) {
-  if (tid < nChannels) {
-    if (chType == ChannelType::SM) {
-      smChannels[channelIndex[tid]].wait();
-    }
-    if (chType == ChannelType::PROXY) {
-      proxyChannels[channelIndex[tid]].wait();
-    }
-  }
-}
-
-template <typename T>
-MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output,
-                                                    uint32_t outputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
-                                                    uint8_t* srcChannelIndex, uint8_t* dstChannelIndex,
-                                                    uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels,
-                                                    int nDstChannels, uint32_t size) {
-  const size_t nInt4 = size / sizeof(int4);
-  const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4);
-  const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4);
-  int4* input4 = (int4*)input;
-  int4* output4 = (int4*)output;
-  for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) {
-    int4 tmp = input4[inputOffset4 + idx];
-    for (int index = 0; index < nSrcChannels; ++index) {
-      int4 val;
-      size_t srcOffset = srcOffsets[index] / sizeof(int4);
-      val = smChannels[srcChannelIndex[index]].read<int4>(srcOffset + idx);
-      tmp = add_vectors<T>(tmp, val);
-    }
-    output4[outputOffset4 + idx] = tmp;
-    for (int index = 0; index < nDstChannels; ++index) {
-      size_t dstOffset = dstOffsets[index] / sizeof(int4);
-      smChannels[dstChannelIndex[index]].write<int4>(dstOffset + idx, tmp);
-    }
-  }
-  // handle rest of data
-  size_t processed = nInt4 * sizeof(int4);
-  const size_t startIdx = (inputOffsetByBytes + processed) / sizeof(T);
-  const size_t endIdx = (inputOffsetByBytes + size) / sizeof(T);
-  for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) {
-    T tmp = input[idx];
-    for (int index = 0; index < nSrcChannels; ++index) {
-      size_t srcOffset = srcOffsets[index] / sizeof(T);
-      tmp += smChannels[srcChannelIndex[index]].read<T>(srcOffset + idx);
-    }
-    output[idx] = tmp;
-    for (int index = 0; index < nDstChannels; ++index) {
-      size_t dstOffset = dstOffsets[index] / sizeof(T);
-      smChannels[dstChannelIndex[index]].write<T>(dstOffset + idx, tmp);
-    }
-  }
-}
-
-template <typename T>
-__global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutionPlan* plan) {
-  extern __shared__ int sharedMem[];
-  int bid = blockIdx.x;
-  int tid = threadIdx.x;
-  DeviceExecutionPlan* localPlan = plan + bid;
-  for (int i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) {
-    sharedMem[i] = ((int*)localPlan)[i];
-  }
-#if defined(MSCCLPP_DEVICE_HIP)
-  __synclds();
-#else   // !defined(MSCCLPP_DEVICE_HIP)
-  __syncthreads();
-#endif  // !defined(MSCCLPP_DEVICE_HIP)
-  Operation* operations = localPlan->operations;
-  DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
-  DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
-  T* src = nullptr;
-  T* dst = nullptr;
-  for (int i = 0; i < localPlan->nOperations; i++) {
-    switch (operations[i].type) {
-      case OperationType::BARRIER:
-        __syncthreads();
-        break;
-      case OperationType::SIGNAL:
-        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels,
-                     operations[i].channelType);
-        break;
-      case OperationType::WAIT:
-        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels,
-                   operations[i].channelType);
-        break;
-      case OperationType::READ_REDUCE_COPY_SEND:
-        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
-        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels,
-                                 operations[i].inputChannelIndexes, operations[i].outputChannelIndexes,
-                                 operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
-                                 operations[i].nOutputChannels, operations[i].size);
-        break;
-      default:
-        break;
-    }
-  }
-}
-
-void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                   cudaStream_t stream) {
-  switch (dataType) {
-    case DataType::INT32:
-      kernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
-                                                                          (int32_t*)scratch, plan);
-      break;
-    case DataType::UINT32:
-      kernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (uint32_t*)src, (uint32_t*)dst,
-                                                                           (uint32_t*)scratch, plan);
-      break;
-    case DataType::FLOAT16:
-      kernel<half>
-          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
-      break;
-    case DataType::FLOAT32:
-      kernel<float>
-          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
-      break;
-  }
-}
-}  // namespace mscclpp
diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp
new file mode 100644
index 000000000..59d341612
--- /dev/null
+++ b/src/include/execution_common.hpp
@@ -0,0 +1,73 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_EXECUTION_COMMON_HPP_
+#define MSCCLPP_EXECUTION_COMMON_HPP_
+
+#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/sm_channel.hpp>
+
+namespace mscclpp {
+
+constexpr int MAX_CHANNEL = 16;
+constexpr int MAX_CHANNEL_PER_OPERATION = 8;
+constexpr int MAX_OPERATION = 64;
+
+enum class BufferType : uint8_t {
+  INPUT,
+  OUTPUT,
+  SCRATCH,
+};
+
+enum class ChannelType : uint8_t {
+  SM,
+  PROXY,
+};
+
+enum class OperationType : uint8_t {
+  BARRIER,
+  PUT,
+  GET,
+  COPY,
+  SIGNAL,
+  WAIT,
+  FLUSH,
+  REDUCE,
+  REDUCE_SEND,
+  READ_REDUCE_COPY,
+  READ_REDUCE_COPY_SEND,
+};
+
+struct Channels {
+  mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
+  mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
+};
+
+struct Operation {
+  OperationType type;
+  ChannelType channelType;
+  BufferType srcBufferType;
+  BufferType dstBufferType;
+  uint8_t nInputChannels;
+  uint8_t nOutputChannels;
+  uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
+  uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
+  uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
+  uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
+  uint32_t srcOffset;
+  uint32_t dstOffset;
+  uint32_t size;
+};
+
+// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes
+struct DeviceExecutionPlan {
+  uint8_t nSmChannels;                  // 1 bytes
+  uint8_t nProxyChannels;               // 1 bytes
+  uint16_t nOperations;                 // 2 bytes
+  Channels channels;                    // 1920 bytes
+  Operation operations[MAX_OPERATION];  // 64 * 100 = 6400 bytes
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_EXECUTION_COMMON_HPP_
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index f1934b567..71c1e140a 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -8,73 +8,261 @@
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
 
-namespace mscclpp {
+#include "execution_common.hpp"
 
-constexpr int MAX_CHANNEL = 16;
-constexpr int MAX_CHANNEL_PER_OPERATION = 8;
-constexpr int MAX_OPERATION = 64;
+#if defined(MSCCLPP_DEVICE_HIP)
+#define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
+#endif  // defined(MSCCLPP_DEVICE_HIP)
 
-enum class BufferType : uint8_t {
-  INPUT,
-  OUTPUT,
-  SCRATCH,
-};
+namespace {
+template <typename To, typename From>
+MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) {
+  static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");
 
-enum class ChannelType : uint8_t {
-  SM,
-  PROXY,
-};
+  union {
+    From f;
+    To t;
+  } u;
+  u.f = src;
+  return u.t;
+}
 
-enum class OperationType : uint8_t {
-  BARRIER,
-  PUT,
-  GET,
-  COPY,
-  SIGNAL,
-  WAIT,
-  FLUSH,
-  REDUCE,
-  REDUCE_SEND,
-  READ_REDUCE_COPY,
-  READ_REDUCE_COPY_SEND,
-};
+template <typename T>
+MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) {
+  return a + b;
+}
 
-struct Channels {
-  mscclpp::DeviceHandle<mscclpp::SmChannel> smChannels[MAX_CHANNEL];
-  mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChannels[MAX_CHANNEL];
-};
+template <>
+MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) {
+  return __hadd2(a, b);
+}
 
-struct Operation {
-  OperationType type;
-  ChannelType channelType;
-  BufferType srcBufferType;
-  BufferType dstBufferType;
-  uint8_t nInputChannels;
-  uint8_t nOutputChannels;
-  uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
-  uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
-  uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
-  uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
-  uint32_t srcOffset;
-  uint32_t dstOffset;
-  uint32_t size;
-};
+template <typename T>
+MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) {
+  int4 ret;
+  ret.w = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
+  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  ret.z = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
+  return ret;
+}
 
-// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes
-struct DeviceExecutionPlan {
-  uint8_t nSmChannels;                  // 1 bytes
-  uint8_t nProxyChannels;               // 1 bytes
-  uint16_t nOperations;                 // 2 bytes
-  Channels channels;                    // 1920 bytes
-  Operation operations[MAX_OPERATION];  // 64 * 100 = 6400 bytes
-};
+template <typename T>
+MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) {
+  uint2 ret;
+  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  return ret;
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) {
+  return bit_cast<int, T>(add_elements(bit_cast<T, int>(a), bit_cast<T, int>(b)));
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) {
+  return bit_cast<uint32_t, T>(add_elements(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+}  // namespace
+
+namespace mscclpp {
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) {
+  if (bufferType == BufferType::INPUT) {
+    return input;
+  }
+  if (bufferType == BufferType::OUTPUT) {
+    return output;
+  }
+  if (bufferType == BufferType::SCRATCH) {
+    return scratch;
+  }
+  return nullptr;
+}
+
+MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle<SmChannel>* smChannels,
+                                        DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
+                                        int nChannels, ChannelType chType) {
+  if (tid < nChannels) {
+    if (chType == ChannelType::SM) {
+      smChannels[channelIndex[tid]].signal();
+    }
+    if (chType == ChannelType::PROXY) {
+      proxyChannels[channelIndex[tid]].signal();
+    }
+  }
+}
+
+MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle<SmChannel>* smChannels,
+                                      DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
+                                      int nChannels, ChannelType chType) {
+  if (tid < nChannels) {
+    if (chType == ChannelType::SM) {
+      smChannels[channelIndex[tid]].wait();
+    }
+    if (chType == ChannelType::PROXY) {
+      proxyChannels[channelIndex[tid]].wait();
+    }
+  }
+}
+
+template <typename T>
+MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output,
+                                                    uint32_t outputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
+                                                    uint8_t* srcChannelIndex, uint8_t* dstChannelIndex,
+                                                    uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels,
+                                                    int nDstChannels, uint32_t size) {
+  const size_t nInt4 = size / sizeof(int4);
+  const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4);
+  const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4);
+  int4* input4 = (int4*)input;
+  int4* output4 = (int4*)output;
+  for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) {
+    int4 tmp = input4[inputOffset4 + idx];
+    for (int index = 0; index < nSrcChannels; ++index) {
+      int4 val;
+      size_t srcOffset = srcOffsets[index] / sizeof(int4);
+      val = smChannels[srcChannelIndex[index]].read<int4>(srcOffset + idx);
+      tmp = add_vectors<T>(tmp, val);
+    }
+    output4[outputOffset4 + idx] = tmp;
+    for (int index = 0; index < nDstChannels; ++index) {
+      size_t dstOffset = dstOffsets[index] / sizeof(int4);
+      smChannels[dstChannelIndex[index]].write<int4>(dstOffset + idx, tmp);
+    }
+  }
+  // handle rest of data
+  size_t processed = nInt4 * sizeof(int4);
+  const size_t startIdx = (inputOffsetByBytes + processed) / sizeof(T);
+  const size_t endIdx = (inputOffsetByBytes + size) / sizeof(T);
+  for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) {
+    T tmp = input[idx];
+    for (int index = 0; index < nSrcChannels; ++index) {
+      size_t srcOffset = srcOffsets[index] / sizeof(T);
+      tmp += smChannels[srcChannelIndex[index]].read<T>(srcOffset + idx);
+    }
+    output[idx] = tmp;
+    for (int index = 0; index < nDstChannels; ++index) {
+      size_t dstOffset = dstOffsets[index] / sizeof(T);
+      smChannels[dstChannelIndex[index]].write<T>(dstOffset + idx, tmp);
+    }
+  }
+}
+
+template <typename T>
+__global__ void kernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
+                       DeviceExecutionPlan* plan) {
+  extern __shared__ int sharedMem[];
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+  DeviceExecutionPlan* localPlan = plan + bid;
+  for (size_t i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) {
+    sharedMem[i] = ((int*)localPlan)[i];
+  }
+#if defined(MSCCLPP_DEVICE_HIP)
+  __synclds();
+#else   // !defined(MSCCLPP_DEVICE_HIP)
+  __syncthreads();
+#endif  // !defined(MSCCLPP_DEVICE_HIP)
+  Operation* operations = localPlan->operations;
+  DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
+  DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
+  T* src = nullptr;
+  T* dst = nullptr;
+  for (int i = 0; i < localPlan->nOperations; i++) {
+    switch (operations[i].type) {
+      case OperationType::BARRIER:
+        __syncthreads();
+        break;
+      case OperationType::SIGNAL:
+        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels,
+                     operations[i].channelType);
+        break;
+      case OperationType::WAIT:
+        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels,
+                   operations[i].channelType);
+        break;
+      case OperationType::READ_REDUCE_COPY_SEND:
+        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
+        handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels,
+                                 operations[i].inputChannelIndexes, operations[i].outputChannelIndexes,
+                                 operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
+                                 operations[i].nOutputChannels, operations[i].size);
+        break;
+      default:
+        break;
+    }
+  }
+}
 
 class ExecutionKernel {
  public:
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream);
+                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) {
+    switch (dataType) {
+      case DataType::INT32:
+        kernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
+                                                                            (int32_t*)scratch, plan);
+        break;
+      case DataType::UINT32:
+        kernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (uint32_t*)src, (uint32_t*)dst,
+                                                                             (uint32_t*)scratch, plan);
+        break;
+      case DataType::FLOAT16:
+        kernel<half>
+            <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
+        break;
+      case DataType::FLOAT32:
+        kernel<float>
+            <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
+        break;
+    }
+  }
 };
-
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_EXECUTION_KERNEL_HPP_
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 3575390ba..6a4aaa80a 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -10,7 +10,7 @@
 #include <string>
 #include <unordered_map>
 
-#include "execution_kernel.hpp"
+#include "execution_common.hpp"
 
 namespace mscclpp {
 

From 7745c873d0fcb3d7ed7a96ebb45abd174ed55585 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 06:43:39 +0000
Subject: [PATCH 27/51] fix build

---
 src/executor/execution_kernel.cu | 34 ++++++++++++++++++++++++++++++++
 src/include/execution_kernel.hpp | 21 ++++++++++++--------
 2 files changed, 47 insertions(+), 8 deletions(-)
 create mode 100644 src/executor/execution_kernel.cu

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
new file mode 100644
index 000000000..a2e37d9ef
--- /dev/null
+++ b/src/executor/execution_kernel.cu
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <mscclpp/device.hpp>
+
+#include "execution_kernel.hpp"
+
+namespace mscclpp {
+
+#if !defined(MSCCLPP_DEVICE_HIP)
+void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                   cudaStream_t stream) {
+  switch (dataType) {
+    case DataType::INT32:
+      kernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
+                                                                          (int32_t*)scratch, plan);
+      break;
+    case DataType::UINT32:
+      kernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (uint32_t*)src, (uint32_t*)dst,
+                                                                           (uint32_t*)scratch, plan);
+      break;
+    case DataType::FLOAT16:
+      kernel<half>
+          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
+      break;
+    case DataType::FLOAT32:
+      kernel<float>
+          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
+      break;
+  }
+}
+#endif  // !defined(MSCCLPP_DEVICE_HIP)
+}  // namespace mscclpp
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 71c1e140a..ef19f30c2 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -193,8 +193,8 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs
 }
 
 template <typename T>
-__global__ void kernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
-                       DeviceExecutionPlan* plan) {
+__global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
+                                DeviceExecutionPlan* plan) {
   extern __shared__ int sharedMem[];
   int bid = blockIdx.x;
   int tid = threadIdx.x;
@@ -241,28 +241,33 @@ __global__ void kernel([[maybe_unused]] int rank /*for debug*/, T* input, T* out
 
 class ExecutionKernel {
  public:
+#if defined(MSCCLPP_DEVICE_HIP)
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
                            DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) {
     switch (dataType) {
       case DataType::INT32:
-        kernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
-                                                                            (int32_t*)scratch, plan);
+        executionKernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
+                                                                                     (int32_t*)scratch, plan);
         break;
       case DataType::UINT32:
-        kernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (uint32_t*)src, (uint32_t*)dst,
-                                                                             (uint32_t*)scratch, plan);
+        executionKernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+            rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan);
         break;
       case DataType::FLOAT16:
-        kernel<half>
+        executionKernel<half>
             <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
         break;
       case DataType::FLOAT32:
-        kernel<float>
+        executionKernel<float>
             <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
         break;
     }
   }
 };
+#else   // !defined(MSCCLPP_DEVICE_HIP)
+  static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream);
+#endif  // defined(MSCCLPP_DEVICE_HIP)
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_EXECUTION_KERNEL_HPP_

From bbf197d508bc6201ba31fcc1dcdaeac1d5d5a56b Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 07:01:13 +0000
Subject: [PATCH 28/51] update

---
 src/executor/execution_kernel.cu | 34 --------------------------------
 src/include/execution_kernel.hpp | 13 +++++++-----
 2 files changed, 8 insertions(+), 39 deletions(-)
 delete mode 100644 src/executor/execution_kernel.cu

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
deleted file mode 100644
index a2e37d9ef..000000000
--- a/src/executor/execution_kernel.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <mscclpp/device.hpp>
-
-#include "execution_kernel.hpp"
-
-namespace mscclpp {
-
-#if !defined(MSCCLPP_DEVICE_HIP)
-void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                   cudaStream_t stream) {
-  switch (dataType) {
-    case DataType::INT32:
-      kernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
-                                                                          (int32_t*)scratch, plan);
-      break;
-    case DataType::UINT32:
-      kernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (uint32_t*)src, (uint32_t*)dst,
-                                                                           (uint32_t*)scratch, plan);
-      break;
-    case DataType::FLOAT16:
-      kernel<half>
-          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
-      break;
-    case DataType::FLOAT32:
-      kernel<float>
-          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
-      break;
-  }
-}
-#endif  // !defined(MSCCLPP_DEVICE_HIP)
-}  // namespace mscclpp
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index ef19f30c2..e3d110d94 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -10,6 +10,7 @@
 
 #include "execution_common.hpp"
 
+#if defined(MSCCLPP_DEVICE_COMPILE)
 #if defined(MSCCLPP_DEVICE_HIP)
 #define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
 #endif  // defined(MSCCLPP_DEVICE_HIP)
@@ -106,9 +107,11 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint3
 }
 
 }  // namespace
+#endif  // defined(MSCCLPP_DEVICE_COMPILE)
 
 namespace mscclpp {
 
+#if defined(MSCCLPP_DEVICE_COMPILE)
 template <typename T>
 MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) {
   if (bufferType == BufferType::INPUT) {
@@ -238,10 +241,11 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
     }
   }
 }
+#endif  // defined(MSCCLPP_DEVICE_COMPILE)
 
 class ExecutionKernel {
  public:
-#if defined(MSCCLPP_DEVICE_HIP)
+#if defined(MSCCLPP_DEVICE_COMPILE)
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
                            DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) {
     switch (dataType) {
@@ -263,11 +267,10 @@ class ExecutionKernel {
         break;
     }
   }
+#else   // !defined(MSCCLPP_DEVICE_COMPILE)
+  static void launchKernel(int, int, int, void*, void*, void*, DataType, DeviceExecutionPlan*, size_t, cudaStream_t) {}
+#endif  // !defined(MSCCLPP_DEVICE_COMPILE)
 };
-#else   // !defined(MSCCLPP_DEVICE_HIP)
-  static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream);
-#endif  // defined(MSCCLPP_DEVICE_HIP)
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_EXECUTION_KERNEL_HPP_

From d38c9edff7383e8c34c9764f0c0e15ad0f89955d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 11:20:03 +0000
Subject: [PATCH 29/51] build fix

---
 src/CMakeLists.txt               |  2 +-
 src/executor/execution_kernel.cu | 29 +++++++++++++++++++++++++++++
 src/include/execution_kernel.hpp |  9 +++++----
 3 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 src/executor/execution_kernel.cu

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cfbcc927a..45b4075d2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu)
 target_sources(mscclpp_obj PRIVATE ${SOURCES})
 target_include_directories(mscclpp_obj PRIVATE include)
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
new file mode 100644
index 000000000..f5a24ff0f
--- /dev/null
+++ b/src/executor/execution_kernel.cu
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "execution_kernel.hpp"
+
+namespace mscclpp {
+void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                   cudaStream_t stream) {
+  switch (dataType) {
+    case DataType::INT32:
+      executionKernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
+                                                                                   (int32_t*)scratch, plan);
+      break;
+    case DataType::UINT32:
+      executionKernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan);
+      break;
+    case DataType::FLOAT16:
+      executionKernel<half>
+          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
+      break;
+    case DataType::FLOAT32:
+      executionKernel<float>
+          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
+      break;
+  }
+}
+}  // namespace mscclpp
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index e3d110d94..6ac592eb6 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -245,7 +245,7 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
 
 class ExecutionKernel {
  public:
-#if defined(MSCCLPP_DEVICE_COMPILE)
+#if defined(MSCCLPP_DEVICE_HIP)
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
                            DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) {
     switch (dataType) {
@@ -267,9 +267,10 @@ class ExecutionKernel {
         break;
     }
   }
-#else   // !defined(MSCCLPP_DEVICE_COMPILE)
-  static void launchKernel(int, int, int, void*, void*, void*, DataType, DeviceExecutionPlan*, size_t, cudaStream_t) {}
-#endif  // !defined(MSCCLPP_DEVICE_COMPILE)
+#else   // !defined(MSCCLPP_DEVICE_HIP)
+  static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream);
+#endif  // !defined(MSCCLPP_DEVICE_HIP)
 };
 }  // namespace mscclpp
 

From 867101e9aec63d68b72bd1d658b134c4a3e9cc74 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 11:28:50 +0000
Subject: [PATCH 30/51] minor update

---
 src/executor/execution_kernel.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index f5a24ff0f..d5e07a3da 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -3,6 +3,7 @@
 
 #include "execution_kernel.hpp"
 
+#if defined(MSCCLPP_DEVICE_CUDA)
 namespace mscclpp {
 void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
                                    DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
@@ -27,3 +28,4 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
   }
 }
 }  // namespace mscclpp
+#endif

From 6049e9e44194236a560813e179cfeca9fd337867 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 5 Apr 2024 12:23:51 +0000
Subject: [PATCH 31/51] more ops

---
 src/include/execution_kernel.hpp | 47 +++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 6ac592eb6..833130fc9 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -140,24 +140,29 @@ MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle<SmChannel>* smChan
 }
 
 MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle<SmChannel>* smChannels,
-                                      DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndex,
+                                      DeviceHandle<SimpleProxyChannel>* proxyChannels, uint8_t* channelIndexes,
                                       int nChannels, ChannelType chType) {
   if (tid < nChannels) {
     if (chType == ChannelType::SM) {
-      smChannels[channelIndex[tid]].wait();
+      smChannels[channelIndexes[tid]].wait();
     }
     if (chType == ChannelType::PROXY) {
-      proxyChannels[channelIndex[tid]].wait();
+      proxyChannels[channelIndexes[tid]].wait();
     }
   }
 }
 
+MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle<SmChannel>& smChannel, uint32_t srcOffset, uint32_t dstOffset,
+                                     uint32_t size) {
+  smChannel.get(dstOffset, srcOffset, size, threadIdx.x, blockDim.x);
+}
+
 template <typename T>
 MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output,
                                                     uint32_t outputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
-                                                    uint8_t* srcChannelIndex, uint8_t* dstChannelIndex,
+                                                    uint8_t* srcChannelIndexes, uint8_t* dstChannelIndexes,
                                                     uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels,
-                                                    int nDstChannels, uint32_t size) {
+                                                    int nDstChannels, uint32_t size, bool sendToRemote = true) {
   const size_t nInt4 = size / sizeof(int4);
   const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4);
   const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4);
@@ -168,13 +173,15 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs
     for (int index = 0; index < nSrcChannels; ++index) {
       int4 val;
       size_t srcOffset = srcOffsets[index] / sizeof(int4);
-      val = smChannels[srcChannelIndex[index]].read<int4>(srcOffset + idx);
+      val = smChannels[srcChannelIndexes[index]].read<int4>(srcOffset + idx);
       tmp = add_vectors<T>(tmp, val);
     }
     output4[outputOffset4 + idx] = tmp;
-    for (int index = 0; index < nDstChannels; ++index) {
-      size_t dstOffset = dstOffsets[index] / sizeof(int4);
-      smChannels[dstChannelIndex[index]].write<int4>(dstOffset + idx, tmp);
+    if (sendToRemote) {
+      for (int index = 0; index < nDstChannels; ++index) {
+        size_t dstOffset = dstOffsets[index] / sizeof(int4);
+        smChannels[dstChannelIndexes[index]].write<int4>(dstOffset + idx, tmp);
+      }
     }
   }
   // handle rest of data
@@ -185,12 +192,14 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs
     T tmp = input[idx];
     for (int index = 0; index < nSrcChannels; ++index) {
       size_t srcOffset = srcOffsets[index] / sizeof(T);
-      tmp += smChannels[srcChannelIndex[index]].read<T>(srcOffset + idx);
+      tmp += smChannels[srcChannelIndexes[index]].read<T>(srcOffset + idx);
     }
     output[idx] = tmp;
-    for (int index = 0; index < nDstChannels; ++index) {
-      size_t dstOffset = dstOffsets[index] / sizeof(T);
-      smChannels[dstChannelIndex[index]].write<T>(dstOffset + idx, tmp);
+    if (sendToRemote) {
+      for (int index = 0; index < nDstChannels; ++index) {
+        size_t dstOffset = dstOffsets[index] / sizeof(T);
+        smChannels[dstChannelIndexes[index]].write<T>(dstOffset + idx, tmp);
+      }
     }
   }
 }
@@ -228,6 +237,10 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
         handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels,
                    operations[i].channelType);
         break;
+      case OperationType::GET:
+        handleGet(smChannels[operations[i].inputChannelIndexes[0]], operations[i].inputOffsets[0],
+                  operations[i].dstOffset, operations[i].size);
+        break;
       case OperationType::READ_REDUCE_COPY_SEND:
         src = getBuffer(input, output, scratch, operations[i].srcBufferType);
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
@@ -236,6 +249,14 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
                                  operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
                                  operations[i].nOutputChannels, operations[i].size);
         break;
+      case OperationType::READ_REDUCE_COPY:
+        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
+        handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels,
+                                 operations[i].inputChannelIndexes, operations[i].outputChannelIndexes,
+                                 operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
+                                 operations[i].nOutputChannels, operations[i].size, false);
+        break;
       default:
         break;
     }

From d97f31274eee1c0d16c305f7af41ede16796fb8d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 7 Apr 2024 11:48:45 +0000
Subject: [PATCH 32/51] WIP

---
 include/mscclpp/core.hpp          |   4 +
 include/mscclpp/executor.hpp      |   7 +-
 include/mscclpp/packet_device.hpp |  15 ++++
 python/mscclpp/__init__.py        |   1 +
 python/mscclpp/executor.cpp       |   9 ++-
 src/executor/execution_kernel.cu  |  25 ++++--
 src/executor/executor.cc          |  25 ++++--
 src/include/execution_common.hpp  |   4 +
 src/include/execution_kernel.hpp  | 129 +++++++++++++++++++++++-------
 9 files changed, 174 insertions(+), 45 deletions(-)

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 50a922bc3..456020975 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -760,6 +760,10 @@ DeviceHandle<std::remove_reference_t<T>> deviceHandle(T&& t) {
   return t.deviceHandle();
 }
 
+/// Packet value type.
+template <class T>
+using PacketValType = typename T::ValueType;
+
 }  // namespace mscclpp
 
 namespace std {
diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index 21087a762..60a68fbb2 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -17,6 +17,11 @@ enum class DataType {
   FLOAT32,
 };
 
+enum class PacketType {
+  LL8,
+  LL16,
+};
+
 class ExecutionPlan {
  public:
   ExecutionPlan(std::string planPath);
@@ -37,7 +42,7 @@ class Executor {
   ~Executor();
 
   void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType,
-               int nthreads, const ExecutionPlan& plan, cudaStream_t stream);
+               int nthreads, const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType = PacketType::LL16);
 
  private:
   struct Impl;
diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp
index 11f63b53f..7678c81b5 100644
--- a/include/mscclpp/packet_device.hpp
+++ b/include/mscclpp/packet_device.hpp
@@ -24,12 +24,20 @@ union alignas(16) LL16Packet {
     uint32_t data2;
     uint32_t flag2;
   };
+  using ValueType = uint2;
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
   ulonglong2 raw_;
 
   MSCCLPP_DEVICE_INLINE LL16Packet() {}
 
+  MSCCLPP_DEVICE_INLINE LL16Packet(uint2 val, uint32_t flag) {
+    data1 = val.x;
+    flag1 = flag;
+    data2 = val.y;
+    flag2 = flag;
+  }
+
   /// Write 8 bytes of data to the packet.
   /// @param val1 The first 4-byte data to write.
   /// @param val2 The second 4-byte data to write.
@@ -95,10 +103,17 @@ union alignas(8) LL8Packet {
     uint32_t flag;
   };
   uint64_t raw_;
+
+  using ValueType = uint32_t;
 #if defined(MSCCLPP_DEVICE_COMPILE)
 
   MSCCLPP_DEVICE_INLINE LL8Packet() {}
 
+  MSCCLPP_DEVICE_INLINE LL8Packet(uint32_t val, uint32_t flag) {
+    data = val;
+    flag = flag;
+  }
+
   MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) {
 #if defined(MSCCLPP_DEVICE_CUDA)
     asm volatile("st.volatile.global.v2.u32 [%0], {%1,%2};" ::"l"(&raw_), "r"(val), "r"(flag));
diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
index 0c8f7eb3b..0acc55fc5 100644
--- a/python/mscclpp/__init__.py
+++ b/python/mscclpp/__init__.py
@@ -22,6 +22,7 @@
     DataType,
     Executor,
     ExecutionPlan,
+    PacketType,
     version,
     is_nvls_supported,
 )
diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp
index 5276e3336..f57a4294b 100644
--- a/python/mscclpp/executor.cpp
+++ b/python/mscclpp/executor.cpp
@@ -18,6 +18,8 @@ void register_executor(nb::module_& m) {
       .value("float16", DataType::FLOAT16)
       .value("float32", DataType::FLOAT32);
 
+  nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
+
   nb::class_<ExecutionPlan>(m, "ExecutionPlan").def(nb::init<std::string>(), nb::arg("planPath"));
 
   nb::class_<Executor>(m, "Executor")
@@ -25,10 +27,11 @@ void register_executor(nb::module_& m) {
       .def(
           "execute",
           [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-             DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream) {
+             DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) {
             self->execute(rank, reinterpret_cast<void*>(sendbuff), reinterpret_cast<void*>(recvBuff), sendBuffSize,
-                          recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream);
+                          recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream, packetType);
           },
           nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"),
-          nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"));
+          nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"),
+          nb::arg("packetType") = PacketType::LL16);
 }
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index d5e07a3da..7aca5b1ed 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -5,27 +5,36 @@
 
 #if defined(MSCCLPP_DEVICE_CUDA)
 namespace mscclpp {
+
+template <typename PacketType>
 void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
                                    DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                   cudaStream_t stream) {
+                                   cudaStream_t stream, uint32_t flag) {
   switch (dataType) {
     case DataType::INT32:
-      executionKernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
-                                                                                   (int32_t*)scratch, plan);
+      executionKernel<int32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag);
       break;
     case DataType::UINT32:
       executionKernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan);
+          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag);
       break;
     case DataType::FLOAT16:
-      executionKernel<half>
-          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
+      executionKernel<half><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst,
+                                                                                (half*)scratch, plan, flag);
       break;
     case DataType::FLOAT32:
-      executionKernel<float>
-          <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
+      executionKernel<float><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst,
+                                                                                 (float*)scratch, plan, flag);
       break;
   }
 }
+
+template void ExecutionKernel::launchKernel<LL16Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
+                                                        void* scratch, DataType dataType, DeviceExecutionPlan* plan,
+                                                        size_t sharedMemSize, cudaStream_t stream, uint32_t flag);
+template void ExecutionKernel::launchKernel<LL8Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
+                                                       void* scratch, DataType dataType, DeviceExecutionPlan* plan,
+                                                       size_t sharedMemSize, cudaStream_t stream, uint32_t flag);
 }  // namespace mscclpp
 #endif
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 1d6d9305e..d775cd593 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -258,12 +258,24 @@ struct Executor::Impl {
   }
 
   void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, void* sendbuff, void* recvbuff,
-                    DataType dataType, cudaStream_t stream) {
+                    DataType dataType, cudaStream_t stream, PacketType packetType) {
+    static uint32_t flag = 0;
     int nthreadblocks = context.deviceExecutionPlans.size();
     size_t sharedMemSize = sizeof(DeviceExecutionPlan);
-    ExecutionKernel::launchKernel(
-        rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType,
-        (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream);
+    switch (packetType) {
+      case PacketType::LL16:
+        ExecutionKernel::launchKernel<LL16Packet>(
+            rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType,
+            (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag);
+        break;
+      case PacketType::LL8:
+        ExecutionKernel::launchKernel<LL8Packet>(
+            rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType,
+            (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag);
+        break;
+      default:
+        throw std::runtime_error("Invalid packet type");
+    }
   }
 };
 
@@ -271,10 +283,11 @@ Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
     : impl_(std::make_unique<Impl>(comm, nranksPerNode)) {}
 
 void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
-                       DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream) {
+                       DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream,
+                       PacketType packetType) {
   ExecutionContext context =
       this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream);
-  this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream);
+  this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream, packetType);
 }
 
 Executor::~Executor() = default;
diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp
index 59d341612..5a63859b8 100644
--- a/src/include/execution_common.hpp
+++ b/src/include/execution_common.hpp
@@ -27,13 +27,17 @@ enum class ChannelType : uint8_t {
 enum class OperationType : uint8_t {
   BARRIER,
   PUT,
+  PUT_PACKET,
   GET,
   COPY,
+  COPY_PACKET,
   SIGNAL,
   WAIT,
   FLUSH,
   REDUCE,
+  REDUCE_PACKET,
   REDUCE_SEND,
+  REDUCE_SEND_PACKET,
   READ_REDUCE_COPY,
   READ_REDUCE_COPY_SEND,
 };
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 833130fc9..023ca1a15 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -5,6 +5,7 @@
 #define MSCCLPP_EXECUTION_KERNEL_HPP_
 
 #include <mscclpp/executor.hpp>
+#include <mscclpp/packet_device.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
 
@@ -102,7 +103,7 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) {
 }
 
 template <>
-MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
+MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
   return add_vectors_helper<__half2>(a, b);
 }
 
@@ -112,6 +113,7 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint3
 namespace mscclpp {
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
+
 template <typename T>
 MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) {
   if (bufferType == BufferType::INPUT) {
@@ -158,11 +160,11 @@ MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle<SmChannel>& smChannel, uint32_
 }
 
 template <typename T>
-MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output,
-                                                    uint32_t outputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
-                                                    uint8_t* srcChannelIndexes, uint8_t* dstChannelIndexes,
-                                                    uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels,
-                                                    int nDstChannels, uint32_t size, bool sendToRemote = true) {
+MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOffsetByBytes, T* input,
+                                                    uint32_t inputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
+                                                    uint8_t* dstChannelIndexes, uint8_t* srcChannelIndexes,
+                                                    uint32_t* dstOffsets, uint32_t* srcOffsets, int nDstChannels,
+                                                    int nSrcChannels, uint32_t size, bool sendToRemote = true) {
   const size_t nInt4 = size / sizeof(int4);
   const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4);
   const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4);
@@ -204,9 +206,59 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs
   }
 }
 
-template <typename T>
+template <typename PacketType>
+MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
+                                           uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels,
+                                           uint32_t size, uint32_t flag) {
+  for (int index = 0; index < nDstChannels; ++index) {
+    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(dstOffsets[index], inputOffsetByBytes, size,
+                                                                threadIdx.x, blockDim.x, flag);
+  }
+}
+
+template <typename T, typename PacketType>
+MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffsetByBytes, T* input,
+                                                  uint32_t inputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
+                                                  uint8_t* dstChannelIndexes, uint32_t* dstOffsets,
+                                                  uint32_t* srcOffsets, int nDstChannels, int nSrcs, size_t size,
+                                                  uint32_t flag) {
+  size_t nPackets = size * 2 / sizeof(PacketType);
+  uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType<PacketType>);
+  uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType<PacketType>);
+  PacketValType<PacketType>* src = (PacketValType<PacketType>*)input + srcOffset;
+  PacketValType<PacketType>* dst = (PacketValType<PacketType>*)output + dstOffset;
+  for (int idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
+    PacketValType<PacketType> data = {};
+    for (int index = 0; index < nSrcs; ++index) {
+      PacketType* pkt = (PacketType*)input + srcOffsets[index] / sizeof(PacketType);
+      PacketValType<PacketType> val = pkt[idx].read(flag);
+      data = add_vectors<T>(data, val);
+    }
+    data = add_vectors<T>(data, src[idx]);
+    dst[idx] = data;
+
+    PacketType pkt(data, flag);
+    for (int index = 0; index < nDstChannels; ++index) {
+      smChannels[dstChannelIndexes[index]].write(dstOffsets[index] / sizeof(PacketValType<PacketType>) + idx, pkt);
+    }
+  }
+}
+
+template <typename PacketType>
+MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size,
+                                            uint32_t flag) {
+  PacketType* srcPackets = (PacketType*)src;
+  PacketValType<PacketType>* result = (PacketValType<PacketType>*)dst;
+  size_t nPackets = size * 2 / sizeof(PacketType);
+  for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
+    PacketValType<PacketType> data = srcPackets[idx].read(flag);
+    result[idx] = data;
+  }
+}
+
+template <typename T, typename PacketType = LL16Packet>
 __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
-                                DeviceExecutionPlan* plan) {
+                                DeviceExecutionPlan* plan, uint32_t flag) {
   extern __shared__ int sharedMem[];
   int bid = blockIdx.x;
   int tid = threadIdx.x;
@@ -242,20 +294,39 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
                   operations[i].dstOffset, operations[i].size);
         break;
       case OperationType::READ_REDUCE_COPY_SEND:
-        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels,
-                                 operations[i].inputChannelIndexes, operations[i].outputChannelIndexes,
-                                 operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
-                                 operations[i].nOutputChannels, operations[i].size);
+        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
+                                 operations[i].outputChannelIndexes, operations[i].inputChannelIndexes,
+                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels,
+                                 operations[i].nInputChannels, operations[i].size);
         break;
       case OperationType::READ_REDUCE_COPY:
+        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
         src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
+                                 operations[i].outputChannelIndexes, operations[i].inputChannelIndexes,
+                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels,
+                                 operations[i].nInputChannels, operations[i].size, false);
+        break;
+      case OperationType::PUT_PACKET:
+        handlePutPacket<PacketType>(operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes,
+                                    operations[i].outputOffsets, operations[i].nOutputChannels, operations[i].size,
+                                    flag);
+        break;
+      case OperationType::REDUCE_SEND_PACKET:
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels,
-                                 operations[i].inputChannelIndexes, operations[i].outputChannelIndexes,
-                                 operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels,
-                                 operations[i].nOutputChannels, operations[i].size, false);
+        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        handleReduceSendPacket<T, PacketType>(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
+                                              operations[i].outputChannelIndexes, operations[i].outputOffsets,
+                                              operations[i].inputOffsets, operations[i].nOutputChannels,
+                                              operations[i].nInputChannels, operations[i].size, flag);
+        break;
+      case OperationType::COPY_PACKET:
+        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
+        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
+        handleCopyPacket<PacketType>(dst, src, operations[i].dstOffset, operations[i].srcOffset, operations[i].size,
+                                     flag);
         break;
       default:
         break;
@@ -267,30 +338,34 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
 class ExecutionKernel {
  public:
 #if defined(MSCCLPP_DEVICE_HIP)
+  template <typename PacketType>
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) {
+                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream,
+                           uint32_t flag = 0) {
     switch (dataType) {
       case DataType::INT32:
-        executionKernel<int32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (int32_t*)src, (int32_t*)dst,
-                                                                                     (int32_t*)scratch, plan);
+        executionKernel<int32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+            rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag);
         break;
       case DataType::UINT32:
-        executionKernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-            rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan);
+        executionKernel<uint32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+            rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag);
         break;
       case DataType::FLOAT16:
-        executionKernel<half>
-            <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst, (half*)scratch, plan);
+        executionKernel<half, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+            rank, (half*)src, (half*)dst, (half*)scratch, plan, flag);
         break;
       case DataType::FLOAT32:
-        executionKernel<float>
-            <<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst, (float*)scratch, plan);
+        executionKernel<float, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+            rank, (float*)src, (float*)dst, (float*)scratch, plan, flag);
         break;
     }
   }
 #else   // !defined(MSCCLPP_DEVICE_HIP)
+  template <typename PacketType>
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream);
+                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream,
+                           uint32_t flag = 0);
 #endif  // !defined(MSCCLPP_DEVICE_HIP)
 };
 }  // namespace mscclpp

From 64106f1f419e40f4880f1ca3edb7407c0be1c64c Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 7 Apr 2024 12:44:09 +0000
Subject: [PATCH 33/51] WIP: fix packet offset handling and add allreduce_packet test plan

---
 include/mscclpp/packet_device.hpp          |   4 +-
 src/executor/execution_plan.cc             |   6 +-
 src/include/execution_kernel.hpp           |  19 +-
 src/include/execution_plan.hpp             |   1 +
 test/execution-files/allreduce_packet.json | 330 +++++++++++++++++++++
 test/executor_test.cc                      |   4 +-
 6 files changed, 350 insertions(+), 14 deletions(-)
 create mode 100644 test/execution-files/allreduce_packet.json

diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp
index 7678c81b5..a20c8abec 100644
--- a/include/mscclpp/packet_device.hpp
+++ b/include/mscclpp/packet_device.hpp
@@ -110,8 +110,8 @@ union alignas(8) LL8Packet {
   MSCCLPP_DEVICE_INLINE LL8Packet() {}
 
   MSCCLPP_DEVICE_INLINE LL8Packet(uint32_t val, uint32_t flag) {
-    data = val;
-    flag = flag;
+    this->data = val;
+    this->flag = flag;
   }
 
   MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) {
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 4a64e86e6..ac4a8fdbf 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -69,7 +69,7 @@ auto convertToChannelType = [](const std::string& str) {
 namespace mscclpp {
 using json = nlohmann::json;
 
-ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath) {}
+ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath), isUsingPacket(false) {}
 
 std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const {
   auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; };
@@ -111,6 +111,10 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) {
   std::ifstream file(this->planPath);
   json obj = json::parse(file);
   this->name = obj["name"];
+  std::string protocol = obj["protocol"];
+  if (protocol == "LL") {
+    this->isUsingPacket = true;
+  }
   auto gpus = obj["gpus"];
 
   for (const auto& gpu : gpus) {
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 023ca1a15..be0533f35 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -211,8 +211,8 @@ MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHa
                                            uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels,
                                            uint32_t size, uint32_t flag) {
   for (int index = 0; index < nDstChannels; ++index) {
-    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(dstOffsets[index], inputOffsetByBytes, size,
-                                                                threadIdx.x, blockDim.x, flag);
+    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(
+        dstOffsets[index] * sizeof(PacketType), inputOffsetByBytes, size, threadIdx.x, blockDim.x, flag);
   }
 }
 
@@ -223,14 +223,14 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffs
                                                   uint32_t* srcOffsets, int nDstChannels, int nSrcs, size_t size,
                                                   uint32_t flag) {
   size_t nPackets = size * 2 / sizeof(PacketType);
-  uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType<PacketType>);
-  uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType<PacketType>);
+  const uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType<PacketType>);
+  const uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType<PacketType>);
   PacketValType<PacketType>* src = (PacketValType<PacketType>*)input + srcOffset;
   PacketValType<PacketType>* dst = (PacketValType<PacketType>*)output + dstOffset;
-  for (int idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
+  for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
     PacketValType<PacketType> data = {};
     for (int index = 0; index < nSrcs; ++index) {
-      PacketType* pkt = (PacketType*)input + srcOffsets[index] / sizeof(PacketType);
+      PacketType* pkt = (PacketType*)((char*)input + 2 * srcOffsets[index]);
       PacketValType<PacketType> val = pkt[idx].read(flag);
       data = add_vectors<T>(data, val);
     }
@@ -239,7 +239,8 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffs
 
     PacketType pkt(data, flag);
     for (int index = 0; index < nDstChannels; ++index) {
-      smChannels[dstChannelIndexes[index]].write(dstOffsets[index] / sizeof(PacketValType<PacketType>) + idx, pkt);
+      size_t offset = (dstOffsets[index] * 2) / sizeof(PacketType);
+      smChannels[dstChannelIndexes[index]].write(offset + idx, pkt);
     }
   }
 }
@@ -247,8 +248,8 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffs
 template <typename PacketType>
 MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size,
                                             uint32_t flag) {
-  PacketType* srcPackets = (PacketType*)src;
-  PacketValType<PacketType>* result = (PacketValType<PacketType>*)dst;
+  PacketType* srcPackets = (PacketType*)((char*)src + 2 * srcOffset);
+  PacketValType<PacketType>* result = (PacketValType<PacketType>*)((char*)dst + dstOffset);
   size_t nPackets = size * 2 / sizeof(PacketType);
   for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
     PacketValType<PacketType> data = srcPackets[idx].read(flag);
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 6a4aaa80a..8c0029f0a 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -62,6 +62,7 @@ struct ExecutionPlan::Impl {
   void setupOperations(const nlohmann::json& gpus);
 
   std::string planPath;
+  bool isUsingPacket;
   // operations for [rank][threadblock] = [operations]
   std::unordered_map<int, std::vector<std::vector<Operation>>> operations;
   std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
diff --git a/test/execution-files/allreduce_packet.json b/test/execution-files/allreduce_packet.json
new file mode 100644
index 000000000..3eda0ff3d
--- /dev/null
+++ b/test/execution-files/allreduce_packet.json
@@ -0,0 +1,330 @@
+{
+    "name": "allreduce_pairs",
+    "colletive": "allreduce",
+    "protocol": "LL",
+    "inplace": true,
+    "gpus": [
+      {
+        "id": 0,
+        "inputChunks": 4,
+        "outputChunks": 0,
+        "scratchChunks": 8,
+        "threadblocks": [
+          {
+            "id": 0,
+            "ops": [
+              {
+                "name": "ppkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 0
+                  }
+                ],
+                "src": 0,
+                "srcbuff": "i",
+                "srcoff": 2,
+                "ctype": "sm",
+                "cnt": 1
+              },
+              {
+                "name": "rspkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 4
+                  }
+                ],
+                "src": 0,
+                "srcs": [
+                  {
+                    "buff": "s",
+                    "off": 2
+                  }
+                ],
+                "srcbuff": "i",
+                "srcoff": 0,
+                "dst": 0,
+                "dstbuff": "i",
+                "dstoff": 0,
+                "ctype": "none",
+                "cnt": 1
+              },
+              {
+                "name": "cpkt",
+                "src": 0,
+                "srcbuff": "s",
+                "srcoff": 6,
+                "dst": 0,
+                "dstbuff": "i",
+                "dstoff": 2,
+                "ctype": "none",
+                "cnt": 1
+              }
+            ],
+            "channels": [
+              {
+                "src": "i",
+                "dst": "s",
+                "ctype": "sm",
+                "cids": [
+                  0
+                ]
+              }
+            ]
+          },
+          {
+            "id": 1,
+            "ops": [
+              {
+                "name": "ppkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 1
+                  }
+                ],
+                "src": 0,
+                "srcbuff": "i",
+                "srcoff": 3,
+                "ctype": "sm",
+                "cnt": 1
+              },
+              {
+                "name": "rspkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 5
+                  }
+                ],
+                "src": 0,
+                "srcs": [
+                  {
+                    "buff": "s",
+                    "off": 3
+                  }
+                ],
+                "srcbuff": "i",
+                "srcoff": 1,
+                "dst": 0,
+                "dstbuff": "i",
+                "dstoff": 1,
+                "ctype": "none",
+                "cnt": 1
+              },
+              {
+                "name": "cpkt",
+                "src": 0,
+                "srcbuff": "s",
+                "srcoff": 7,
+                "dst": 0,
+                "dstbuff": "i",
+                "dstoff": 3,
+                "ctype": "none",
+                "cnt": 1
+              }
+            ],
+            "channels": [
+              {
+                "src": "i",
+                "dst": "s",
+                "ctype": "sm",
+                "cids": [
+                  1
+                ]
+              }
+            ]
+          }
+        ],
+        "channels": [
+          {
+            "srcbuff": "i",
+            "dstbuff": "s",
+            "type": "sm",
+            "connectedTo": [
+              1,
+              1
+            ]
+          }
+        ]
+      },
+      {
+        "id": 1,
+        "inputChunks": 4,
+        "outputChunks": 0,
+        "scratchChunks": 6,
+        "threadblocks": [
+          {
+            "id": 0,
+            "ops": [
+              {
+                "name": "ppkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 2
+                  }
+                ],
+                "src": 1,
+                "srcbuff": "i",
+                "srcoff": 0,
+                "ctype": "sm",
+                "cnt": 1
+              },
+              {
+                "name": "rspkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 6
+                  }
+                ],
+                "src": 1,
+                "srcs": [
+                  {
+                    "buff": "s",
+                    "off": 0
+                  }
+                ],
+                "srcbuff": "i",
+                "srcoff": 2,
+                "dst": 1,
+                "dstbuff": "i",
+                "dstoff": 2,
+                "ctype": "none",
+                "cnt": 1
+              },
+              {
+                "name": "cpkt",
+                "src": 1,
+                "srcbuff": "s",
+                "srcoff": 4,
+                "dst": 1,
+                "dstbuff": "i",
+                "dstoff": 0,
+                "ctype": "none",
+                "cnt": 1
+              }
+            ],
+            "channels": [
+              {
+                "src": "i",
+                "dst": "s",
+                "ctype": "sm",
+                "cids": [
+                  0
+                ]
+              }
+            ]
+          },
+          {
+            "id": 1,
+            "ops": [
+              {
+                "name": "ppkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 3
+                  }
+                ],
+                "src": 1,
+                "srcbuff": "i",
+                "srcoff": 1,
+                "ctype": "sm",
+                "cnt": 1
+              },
+              {
+                "name": "rspkt",
+                "o_buff": {
+                  "src": "i",
+                  "dst": "s"
+                },
+                "o_cids": [
+                  {
+                    "id": 0,
+                    "off": 7
+                  }
+                ],
+                "src": 1,
+                "srcs": [
+                  {
+                    "buff": "s",
+                    "off": 1
+                  }
+                ],
+                "srcbuff": "i",
+                "srcoff": 3,
+                "dst": 1,
+                "dstbuff": "i",
+                "dstoff": 3,
+                "ctype": "none",
+                "cnt": 1
+              },
+              {
+                "name": "cpkt",
+                "src": 1,
+                "srcbuff": "s",
+                "srcoff": 5,
+                "dst": 1,
+                "dstbuff": "i",
+                "dstoff": 1,
+                "ctype": "none",
+                "cnt": 1
+              }
+            ],
+            "channels": [
+              {
+                "src": "i",
+                "dst": "s",
+                "ctype": "sm",
+                "cids": [
+                  1
+                ]
+              }
+            ]
+          }
+        ],
+        "channels": [
+          {
+            "srcbuff": "i",
+            "dstbuff": "s",
+            "type": "sm",
+            "connectedTo": [
+              0,
+              0
+            ]
+          }
+        ]
+      }
+    ]
+  }
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 4a7b36a79..c708a97d3 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -29,12 +29,12 @@ int main() {
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
   bootstrap->initialize(id);
   // sleep 20s
-  // std::this_thread::sleep_for(std::chrono::seconds(20));
+  std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   CUDACHECK(cudaSetDevice(rank));
 
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
-  mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
+  mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce_packet.json");
   const int bufferSize = 1024 * 1024;
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
   mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);

From feaf058e1cee4c84e955bfc80fa8317a408130fa Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 7 Apr 2024 12:45:55 +0000
Subject: [PATCH 34/51] update

---
 src/executor/execution_plan.cc   |  8 ++++----
 src/include/execution_common.hpp |  4 ++--
 src/include/execution_kernel.hpp | 19 +++++++++----------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index ac4a8fdbf..4c75797bb 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -209,19 +209,19 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
           operation.channelType = convertToChannelType(op["ctype"]);
         }
         if (op.contains("i_cids")) {
-          operation.nInputChannels = op["i_cids"].size();
+          operation.nInputs = op["i_cids"].size();
         }
         if (op.contains("o_cids")) {
-          operation.nOutputChannels = op["o_cids"].size();
+          operation.nOutputs = op["o_cids"].size();
         }
-        for (int i = 0; i < operation.nInputChannels; i++) {
+        for (int i = 0; i < operation.nInputs; i++) {
           BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
           BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
           operation.inputChannelIndexes[i] =
               channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
           operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
         }
-        for (int i = 0; i < operation.nOutputChannels; i++) {
+        for (int i = 0; i < operation.nOutputs; i++) {
           BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
           BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
           operation.outputChannelIndexes[i] =
diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp
index 5a63859b8..ba61fb84c 100644
--- a/src/include/execution_common.hpp
+++ b/src/include/execution_common.hpp
@@ -52,8 +52,8 @@ struct Operation {
   ChannelType channelType;
   BufferType srcBufferType;
   BufferType dstBufferType;
-  uint8_t nInputChannels;
-  uint8_t nOutputChannels;
+  uint8_t nInputs;
+  uint8_t nOutputs;
   uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
   uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
   uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index be0533f35..e2ceaf224 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -283,11 +283,11 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
         __syncthreads();
         break;
       case OperationType::SIGNAL:
-        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels,
+        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputs,
                      operations[i].channelType);
         break;
       case OperationType::WAIT:
-        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels,
+        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputs,
                    operations[i].channelType);
         break;
       case OperationType::GET:
@@ -299,29 +299,28 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
         src = getBuffer(input, output, scratch, operations[i].srcBufferType);
         handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
                                  operations[i].outputChannelIndexes, operations[i].inputChannelIndexes,
-                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels,
-                                 operations[i].nInputChannels, operations[i].size);
+                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs,
+                                 operations[i].nInputs, operations[i].size);
         break;
       case OperationType::READ_REDUCE_COPY:
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
         src = getBuffer(input, output, scratch, operations[i].srcBufferType);
         handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
                                  operations[i].outputChannelIndexes, operations[i].inputChannelIndexes,
-                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels,
-                                 operations[i].nInputChannels, operations[i].size, false);
+                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs,
+                                 operations[i].nInputs, operations[i].size, false);
         break;
       case OperationType::PUT_PACKET:
         handlePutPacket<PacketType>(operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes,
-                                    operations[i].outputOffsets, operations[i].nOutputChannels, operations[i].size,
-                                    flag);
+                                    operations[i].outputOffsets, operations[i].nOutputs, operations[i].size, flag);
         break;
       case OperationType::REDUCE_SEND_PACKET:
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
         src = getBuffer(input, output, scratch, operations[i].srcBufferType);
         handleReduceSendPacket<T, PacketType>(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
                                               operations[i].outputChannelIndexes, operations[i].outputOffsets,
-                                              operations[i].inputOffsets, operations[i].nOutputChannels,
-                                              operations[i].nInputChannels, operations[i].size, flag);
+                                              operations[i].inputOffsets, operations[i].nOutputs, operations[i].nInputs,
+                                              operations[i].size, flag);
         break;
       case OperationType::COPY_PACKET:
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);

From d52ef41ba5b1836fd812ebd1776087fb82970608 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 7 Apr 2024 13:47:36 +0000
Subject: [PATCH 35/51] WIP

---
 src/executor/execution_plan.cc             |  18 +
 src/include/execution_common.hpp           |   1 +
 test/execution-files/allreduce_packet.json | 636 ++++++++++-----------
 3 files changed, 337 insertions(+), 318 deletions(-)

diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 4c75797bb..29bd28e07 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -37,6 +37,12 @@ auto getOpType = [](const std::string& str) {
     return mscclpp::OperationType::READ_REDUCE_COPY;
   } else if (str == "rrcs") {
     return mscclpp::OperationType::READ_REDUCE_COPY_SEND;
+  } else if (str == "ppkt") {
+    return mscclpp::OperationType::PUT_PACKET;
+  } else if (str == "rspkt") {
+    return mscclpp::OperationType::REDUCE_SEND_PACKET;
+  } else if (str == "cpkt") {
+    return mscclpp::OperationType::COPY_PACKET;
   } else {
     throw std::runtime_error("Invalid operation type");
   }
@@ -59,6 +65,8 @@ auto convertToChannelType = [](const std::string& str) {
     return mscclpp::ChannelType::SM;
   } else if (str == "proxy") {
     return mscclpp::ChannelType::PROXY;
+  } else if (str == "none") {
+    return mscclpp::ChannelType::NONE;
   } else {
     throw std::runtime_error("Invalid channel type");
   }
@@ -99,6 +107,9 @@ std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c
   return std::vector<BufferType>(bufferTypes.begin(), bufferTypes.end());
 }
 size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const {
+  if (this->isUsingPacket) {
+    return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank) * 2;
+  }
   return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank);
 }
 std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadblock) const {
@@ -221,6 +232,13 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
               channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
           operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
         }
+        // will have either srcs or i_cids
+        if (op.contains("srcs")) {
+          operation.nInputs = op["srcs"].size();
+        }
+        for (int i = 0; i < operation.nInputs; i++) {
+          operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"];
+        }
         for (int i = 0; i < operation.nOutputs; i++) {
           BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
           BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp
index ba61fb84c..2d03feb61 100644
--- a/src/include/execution_common.hpp
+++ b/src/include/execution_common.hpp
@@ -20,6 +20,7 @@ enum class BufferType : uint8_t {
 };
 
 enum class ChannelType : uint8_t {
+  NONE,
   SM,
   PROXY,
 };
diff --git a/test/execution-files/allreduce_packet.json b/test/execution-files/allreduce_packet.json
index 3eda0ff3d..7045f21c2 100644
--- a/test/execution-files/allreduce_packet.json
+++ b/test/execution-files/allreduce_packet.json
@@ -1,330 +1,330 @@
 {
-    "name": "allreduce_pairs",
-    "colletive": "allreduce",
-    "protocol": "LL",
-    "inplace": true,
-    "gpus": [
-      {
-        "id": 0,
-        "inputChunks": 4,
-        "outputChunks": 0,
-        "scratchChunks": 8,
-        "threadblocks": [
-          {
-            "id": 0,
-            "ops": [
-              {
-                "name": "ppkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 0
-                  }
-                ],
-                "src": 0,
-                "srcbuff": "i",
-                "srcoff": 2,
-                "ctype": "sm",
-                "cnt": 1
-              },
-              {
-                "name": "rspkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 4
-                  }
-                ],
-                "src": 0,
-                "srcs": [
-                  {
-                    "buff": "s",
-                    "off": 2
-                  }
-                ],
-                "srcbuff": "i",
-                "srcoff": 0,
-                "dst": 0,
-                "dstbuff": "i",
-                "dstoff": 0,
-                "ctype": "none",
-                "cnt": 1
+  "name": "allreduce_pairs",
+  "colletive": "allreduce",
+  "protocol": "LL",
+  "inplace": true,
+  "gpus": [
+    {
+      "id": 0,
+      "inputChunks": 4,
+      "outputChunks": 0,
+      "scratchChunks": 8,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "ppkt",
+              "o_buff": {
+                "src": "i",
+                "dst": "s"
               },
-              {
-                "name": "cpkt",
-                "src": 0,
-                "srcbuff": "s",
-                "srcoff": 6,
-                "dst": 0,
-                "dstbuff": "i",
-                "dstoff": 2,
-                "ctype": "none",
-                "cnt": 1
-              }
-            ],
-            "channels": [
-              {
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 0
+                }
+              ],
+              "src": 0,
+              "srcbuff": "i",
+              "srcoff": 2,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "rspkt",
+              "o_buff": {
                 "src": "i",
-                "dst": "s",
-                "ctype": "sm",
-                "cids": [
-                  0
-                ]
-              }
-            ]
-          },
-          {
-            "id": 1,
-            "ops": [
-              {
-                "name": "ppkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 1
-                  }
-                ],
-                "src": 0,
-                "srcbuff": "i",
-                "srcoff": 3,
-                "ctype": "sm",
-                "cnt": 1
+                "dst": "s"
               },
-              {
-                "name": "rspkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 5
-                  }
-                ],
-                "src": 0,
-                "srcs": [
-                  {
-                    "buff": "s",
-                    "off": 3
-                  }
-                ],
-                "srcbuff": "i",
-                "srcoff": 1,
-                "dst": 0,
-                "dstbuff": "i",
-                "dstoff": 1,
-                "ctype": "none",
-                "cnt": 1
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 4
+                }
+              ],
+              "src": 0,
+              "srcs": [
+                {
+                  "buff": "s",
+                  "off": 2
+                }
+              ],
+              "srcbuff": "i",
+              "srcoff": 0,
+              "dst": 0,
+              "dstbuff": "i",
+              "dstoff": 0,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "cpkt",
+              "src": 0,
+              "srcbuff": "s",
+              "srcoff": 6,
+              "dst": 0,
+              "dstbuff": "i",
+              "dstoff": 2,
+              "ctype": "none",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "s",
+              "ctype": "sm",
+              "cids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "ppkt",
+              "o_buff": {
+                "src": "i",
+                "dst": "s"
               },
-              {
-                "name": "cpkt",
-                "src": 0,
-                "srcbuff": "s",
-                "srcoff": 7,
-                "dst": 0,
-                "dstbuff": "i",
-                "dstoff": 3,
-                "ctype": "none",
-                "cnt": 1
-              }
-            ],
-            "channels": [
-              {
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 1
+                }
+              ],
+              "src": 0,
+              "srcbuff": "i",
+              "srcoff": 3,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "rspkt",
+              "o_buff": {
                 "src": "i",
-                "dst": "s",
-                "ctype": "sm",
-                "cids": [
-                  1
-                ]
-              }
-            ]
-          }
-        ],
-        "channels": [
-          {
-            "srcbuff": "i",
-            "dstbuff": "s",
-            "type": "sm",
-            "connectedTo": [
-              1,
-              1
-            ]
-          }
-        ]
-      },
-      {
-        "id": 1,
-        "inputChunks": 4,
-        "outputChunks": 0,
-        "scratchChunks": 6,
-        "threadblocks": [
-          {
-            "id": 0,
-            "ops": [
-              {
-                "name": "ppkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 2
-                  }
-                ],
-                "src": 1,
-                "srcbuff": "i",
-                "srcoff": 0,
-                "ctype": "sm",
-                "cnt": 1
+                "dst": "s"
               },
-              {
-                "name": "rspkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 6
-                  }
-                ],
-                "src": 1,
-                "srcs": [
-                  {
-                    "buff": "s",
-                    "off": 0
-                  }
-                ],
-                "srcbuff": "i",
-                "srcoff": 2,
-                "dst": 1,
-                "dstbuff": "i",
-                "dstoff": 2,
-                "ctype": "none",
-                "cnt": 1
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 5
+                }
+              ],
+              "src": 0,
+              "srcs": [
+                {
+                  "buff": "s",
+                  "off": 3
+                }
+              ],
+              "srcbuff": "i",
+              "srcoff": 1,
+              "dst": 0,
+              "dstbuff": "i",
+              "dstoff": 1,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "cpkt",
+              "src": 0,
+              "srcbuff": "s",
+              "srcoff": 7,
+              "dst": 0,
+              "dstbuff": "i",
+              "dstoff": 3,
+              "ctype": "none",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "s",
+              "ctype": "sm",
+              "cids": [
+                1
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "srcbuff": "i",
+          "dstbuff": "s",
+          "type": "sm",
+          "connectedTo": [
+            1,
+            1
+          ]
+        }
+      ]
+    },
+    {
+      "id": 1,
+      "inputChunks": 4,
+      "outputChunks": 0,
+      "scratchChunks": 6,
+      "threadblocks": [
+        {
+          "id": 0,
+          "ops": [
+            {
+              "name": "ppkt",
+              "o_buff": {
+                "src": "i",
+                "dst": "s"
               },
-              {
-                "name": "cpkt",
-                "src": 1,
-                "srcbuff": "s",
-                "srcoff": 4,
-                "dst": 1,
-                "dstbuff": "i",
-                "dstoff": 0,
-                "ctype": "none",
-                "cnt": 1
-              }
-            ],
-            "channels": [
-              {
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 2
+                }
+              ],
+              "src": 1,
+              "srcbuff": "i",
+              "srcoff": 0,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "rspkt",
+              "o_buff": {
                 "src": "i",
-                "dst": "s",
-                "ctype": "sm",
-                "cids": [
-                  0
-                ]
-              }
-            ]
-          },
-          {
-            "id": 1,
-            "ops": [
-              {
-                "name": "ppkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 3
-                  }
-                ],
-                "src": 1,
-                "srcbuff": "i",
-                "srcoff": 1,
-                "ctype": "sm",
-                "cnt": 1
+                "dst": "s"
               },
-              {
-                "name": "rspkt",
-                "o_buff": {
-                  "src": "i",
-                  "dst": "s"
-                },
-                "o_cids": [
-                  {
-                    "id": 0,
-                    "off": 7
-                  }
-                ],
-                "src": 1,
-                "srcs": [
-                  {
-                    "buff": "s",
-                    "off": 1
-                  }
-                ],
-                "srcbuff": "i",
-                "srcoff": 3,
-                "dst": 1,
-                "dstbuff": "i",
-                "dstoff": 3,
-                "ctype": "none",
-                "cnt": 1
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 6
+                }
+              ],
+              "src": 1,
+              "srcs": [
+                {
+                  "buff": "s",
+                  "off": 0
+                }
+              ],
+              "srcbuff": "i",
+              "srcoff": 2,
+              "dst": 1,
+              "dstbuff": "i",
+              "dstoff": 2,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "cpkt",
+              "src": 1,
+              "srcbuff": "s",
+              "srcoff": 4,
+              "dst": 1,
+              "dstbuff": "i",
+              "dstoff": 0,
+              "ctype": "none",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "s",
+              "ctype": "sm",
+              "cids": [
+                0
+              ]
+            }
+          ]
+        },
+        {
+          "id": 1,
+          "ops": [
+            {
+              "name": "ppkt",
+              "o_buff": {
+                "src": "i",
+                "dst": "s"
               },
-              {
-                "name": "cpkt",
-                "src": 1,
-                "srcbuff": "s",
-                "srcoff": 5,
-                "dst": 1,
-                "dstbuff": "i",
-                "dstoff": 1,
-                "ctype": "none",
-                "cnt": 1
-              }
-            ],
-            "channels": [
-              {
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 3
+                }
+              ],
+              "src": 1,
+              "srcbuff": "i",
+              "srcoff": 1,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "rspkt",
+              "o_buff": {
                 "src": "i",
-                "dst": "s",
-                "ctype": "sm",
-                "cids": [
-                  1
-                ]
-              }
-            ]
-          }
-        ],
-        "channels": [
-          {
-            "srcbuff": "i",
-            "dstbuff": "s",
-            "type": "sm",
-            "connectedTo": [
-              0,
-              0
-            ]
-          }
-        ]
-      }
-    ]
-  }
+                "dst": "s"
+              },
+              "o_cids": [
+                {
+                  "id": 0,
+                  "off": 7
+                }
+              ],
+              "src": 1,
+              "srcs": [
+                {
+                  "buff": "s",
+                  "off": 1
+                }
+              ],
+              "srcbuff": "i",
+              "srcoff": 3,
+              "dst": 1,
+              "dstbuff": "i",
+              "dstoff": 3,
+              "ctype": "sm",
+              "cnt": 1
+            },
+            {
+              "name": "cpkt",
+              "src": 1,
+              "srcbuff": "s",
+              "srcoff": 5,
+              "dst": 1,
+              "dstbuff": "i",
+              "dstoff": 1,
+              "ctype": "none",
+              "cnt": 1
+            }
+          ],
+          "channels": [
+            {
+              "src": "i",
+              "dst": "s",
+              "ctype": "sm",
+              "cids": [
+                1
+              ]
+            }
+          ]
+        }
+      ],
+      "channels": [
+        {
+          "srcbuff": "i",
+          "dstbuff": "s",
+          "type": "sm",
+          "connectedTo": [
+            0,
+            0
+          ]
+        }
+      ]
+    }
+  ]
+}

From b03be9ab3591631d8d0ac4429cf81cea1381af69 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Sun, 7 Apr 2024 14:42:21 +0000
Subject: [PATCH 36/51] WIP

---
 src/executor/execution_plan.cc   |  1 +
 src/include/execution_common.hpp |  5 ++++-
 src/include/execution_kernel.hpp | 38 +++++++++++++++++---------------
 test/executor_test.cc            |  2 +-
 4 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 29bd28e07..55123d6b0 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -235,6 +235,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
         // will have either srcs or i_cids
         if (op.contains("srcs")) {
           operation.nInputs = op["srcs"].size();
+          operation.inputBufferType = convertToBufferType(op["srcs"][0]["buff"]);
         }
         for (int i = 0; i < operation.nInputs; i++) {
           operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"];
diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp
index 2d03feb61..685317268 100644
--- a/src/include/execution_common.hpp
+++ b/src/include/execution_common.hpp
@@ -55,7 +55,10 @@ struct Operation {
   BufferType dstBufferType;
   uint8_t nInputs;
   uint8_t nOutputs;
-  uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
+  union {
+    uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
+    BufferType inputBufferType;
+  };
   uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION];
   uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
   uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index e2ceaf224..a026905aa 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -211,36 +211,36 @@ MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHa
                                            uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels,
                                            uint32_t size, uint32_t flag) {
   for (int index = 0; index < nDstChannels; ++index) {
-    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(
-        dstOffsets[index] * sizeof(PacketType), inputOffsetByBytes, size, threadIdx.x, blockDim.x, flag);
+    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(dstOffsets[index] * 2, inputOffsetByBytes, size,
+                                                                threadIdx.x, blockDim.x, flag);
   }
 }
 
 template <typename T, typename PacketType>
-MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffsetByBytes, T* input,
-                                                  uint32_t inputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
-                                                  uint8_t* dstChannelIndexes, uint32_t* dstOffsets,
-                                                  uint32_t* srcOffsets, int nDstChannels, int nSrcs, size_t size,
+MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes,
+                                                  T* inputBuff, uint32_t* inputOffsets, int nSrcs,
+                                                  DeviceHandle<SmChannel>* smChannels, uint8_t* outputChannelIndexes,
+                                                  uint32_t* outputOffsets, int nDstChannels, size_t size,
                                                   uint32_t flag) {
   size_t nPackets = size * 2 / sizeof(PacketType);
-  const uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType<PacketType>);
-  const uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType<PacketType>);
-  PacketValType<PacketType>* src = (PacketValType<PacketType>*)input + srcOffset;
-  PacketValType<PacketType>* dst = (PacketValType<PacketType>*)output + dstOffset;
+  const uint32_t srcOffset = dstOffsetByBytes / sizeof(PacketValType<PacketType>);
+  const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType<PacketType>);
+  PacketValType<PacketType>* srcPacketValue = (PacketValType<PacketType>*)src + srcOffset;
+  PacketValType<PacketType>* dstPacketValue = (PacketValType<PacketType>*)dst + dstOffset;
   for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
     PacketValType<PacketType> data = {};
     for (int index = 0; index < nSrcs; ++index) {
-      PacketType* pkt = (PacketType*)((char*)input + 2 * srcOffsets[index]);
+      PacketType* pkt = (PacketType*)((char*)inputBuff + 2 * inputOffsets[index]);
       PacketValType<PacketType> val = pkt[idx].read(flag);
       data = add_vectors<T>(data, val);
     }
-    data = add_vectors<T>(data, src[idx]);
-    dst[idx] = data;
+    data = add_vectors<T>(data, srcPacketValue[idx]);
+    dstPacketValue[idx] = data;
 
     PacketType pkt(data, flag);
     for (int index = 0; index < nDstChannels; ++index) {
-      size_t offset = (dstOffsets[index] * 2) / sizeof(PacketType);
-      smChannels[dstChannelIndexes[index]].write(offset + idx, pkt);
+      size_t offset = (outputOffsets[index] * 2) / sizeof(PacketType);
+      smChannels[outputChannelIndexes[index]].write(offset + idx, pkt);
     }
   }
 }
@@ -277,6 +277,7 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
   DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
   T* src = nullptr;
   T* dst = nullptr;
+  T* tmp = nullptr;
   for (int i = 0; i < localPlan->nOperations; i++) {
     switch (operations[i].type) {
       case OperationType::BARRIER:
@@ -317,10 +318,11 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
       case OperationType::REDUCE_SEND_PACKET:
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
         src = getBuffer(input, output, scratch, operations[i].srcBufferType);
-        handleReduceSendPacket<T, PacketType>(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
+        tmp = getBuffer(input, output, scratch, operations[i].inputBufferType);
+        handleReduceSendPacket<T, PacketType>(dst, operations[i].dstOffset, src, operations[i].srcOffset, tmp,
+                                              operations[i].inputOffsets, operations[i].nInputs, smChannels,
                                               operations[i].outputChannelIndexes, operations[i].outputOffsets,
-                                              operations[i].inputOffsets, operations[i].nOutputs, operations[i].nInputs,
-                                              operations[i].size, flag);
+                                              operations[i].nOutputs, operations[i].size, flag);
         break;
       case OperationType::COPY_PACKET:
         dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
diff --git a/test/executor_test.cc b/test/executor_test.cc
index c708a97d3..4a8822a99 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -29,7 +29,7 @@ int main() {
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
   bootstrap->initialize(id);
   // sleep 20s
-  std::this_thread::sleep_for(std::chrono::seconds(20));
+  // std::this_thread::sleep_for(std::chrono::seconds(20));
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   CUDACHECK(cudaSetDevice(rank));
 

From 695ff9449a5de7b303bd6bfb1d1890daa2dd85d4 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 8 Apr 2024 03:29:39 +0000
Subject: [PATCH 37/51] update

---
 include/mscclpp/executor.hpp   |  2 +-
 python/mscclpp/executor.cpp    |  3 ++-
 python/test/executor_test.py   | 38 ++++++++++++++++++++++++++-
 src/executor/execution_plan.cc | 48 ++++++++++++++++++----------------
 src/executor/executor.cc       | 10 +++----
 src/include/execution_plan.hpp |  6 ++---
 test/executor_test.cc          |  2 +-
 7 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index 60a68fbb2..f54c80585 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -24,7 +24,7 @@ enum class PacketType {
 
 class ExecutionPlan {
  public:
-  ExecutionPlan(std::string planPath);
+  ExecutionPlan(const std::string name, const std::string planPath);
   ~ExecutionPlan() = default;
 
  private:
diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp
index f57a4294b..9b411ba8a 100644
--- a/python/mscclpp/executor.cpp
+++ b/python/mscclpp/executor.cpp
@@ -20,7 +20,8 @@ void register_executor(nb::module_& m) {
 
   nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
 
-  nb::class_<ExecutionPlan>(m, "ExecutionPlan").def(nb::init<std::string>(), nb::arg("planPath"));
+  nb::class_<ExecutionPlan>(m, "ExecutionPlan")
+      .def(nb::init<const std::string, const std::string>(), nb::arg("name"), nb::arg("planPath"));
 
   nb::class_<Executor>(m, "Executor")
       .def(nb::init<std::shared_ptr<Communicator>, int>(), nb::arg("comm"), nb::arg("nranksPerNode"))
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index a777c3546..3cef5a318 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -11,6 +11,29 @@
 
 MSCCLPP_ROOT_PATH = "/root/mscclpp"
 
+def bench_time(niters: int, func):
+    # capture cuda graph for niters of the kernel launch
+    stream = cp.cuda.Stream(non_blocking=True)
+    with stream:
+        stream.begin_capture()
+        for i in range(niters):
+            func(stream)
+        graph = stream.end_capture()
+
+    # now run a warm up round
+    graph.launch(stream)
+
+    # now run the benchmark and measure time
+    start = cp.cuda.Event()
+    end = cp.cuda.Event()
+
+    start.record(stream)
+    graph.launch(stream)
+    end.record(stream)
+    end.synchronize()
+
+    return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0
+
 if __name__ == "__main__":
     shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL)
     N_GPUS_PER_NODE = shm_comm.size
@@ -19,7 +42,7 @@
     cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use()
     mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
     executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
-    execution_plan = ExecutionPlan(path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json"))
+    execution_plan = ExecutionPlan("allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json"))
 
     nelems = 1024 * 1024
     cp.random.seed(42)
@@ -45,5 +68,18 @@
     )
     stream.synchronize()
     assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size)
+
+    execution_time = bench_time(1000, lambda stream: executor.execute(
+        MPI.COMM_WORLD.rank,
+        sendbuf.data.ptr,
+        sendbuf.data.ptr,
+        sendbuf.nbytes,
+        sendbuf.nbytes,
+        DataType.float16,
+        512,
+        execution_plan,
+        stream.ptr,
+    ))
+    print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
     executor = None
     mscclpp_group = None
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 55123d6b0..fb3f3a027 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -77,7 +77,8 @@ auto convertToChannelType = [](const std::string& str) {
 namespace mscclpp {
 using json = nlohmann::json;
 
-ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath), isUsingPacket(false) {}
+ExecutionPlan::Impl::Impl(const std::string name, const std::string planPath)
+    : name(name), planPath(planPath), isUsingPacket(false) {}
 
 std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const {
   auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; };
@@ -121,7 +122,9 @@ int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->oper
 void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) {
   std::ifstream file(this->planPath);
   json obj = json::parse(file);
-  this->name = obj["name"];
+  if (this->name != obj["name"]) {
+    throw std::runtime_error("Plan name does not match");
+  }
   std::string protocol = obj["protocol"];
   if (protocol == "LL") {
     this->isUsingPacket = true;
@@ -221,31 +224,31 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
         }
         if (op.contains("i_cids")) {
           operation.nInputs = op["i_cids"].size();
-        }
-        if (op.contains("o_cids")) {
-          operation.nOutputs = op["o_cids"].size();
-        }
-        for (int i = 0; i < operation.nInputs; i++) {
-          BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
-          BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
-          operation.inputChannelIndexes[i] =
-              channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
-          operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
+          for (int i = 0; i < operation.nInputs; i++) {
+            BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
+            BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
+            operation.inputChannelIndexes[i] =
+                channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
+            operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
+          }
         }
         // will have either srcs or i_cids
         if (op.contains("srcs")) {
           operation.nInputs = op["srcs"].size();
           operation.inputBufferType = convertToBufferType(op["srcs"][0]["buff"]);
+          for (int i = 0; i < operation.nInputs; i++) {
+            operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"];
+          }
         }
-        for (int i = 0; i < operation.nInputs; i++) {
-          operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"];
-        }
-        for (int i = 0; i < operation.nOutputs; i++) {
-          BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
-          BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
-          operation.outputChannelIndexes[i] =
-              channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
-          operation.outputOffsets[i] = this->chunkSize * (int)op["o_cids"][i]["off"];
+        if (op.contains("o_cids")) {
+          operation.nOutputs = op["o_cids"].size();
+          for (int i = 0; i < operation.nOutputs; i++) {
+            BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
+            BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
+            operation.outputChannelIndexes[i] =
+                channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]];
+            operation.outputOffsets[i] = this->chunkSize * (int)op["o_cids"][i]["off"];
+          }
         }
         if (op.contains("srcbuff")) {
           operation.srcBufferType = convertToBufferType(op["srcbuff"]);
@@ -269,6 +272,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
   }
 }
 
-ExecutionPlan::ExecutionPlan(std::string planPath) : impl_(std::make_shared<Impl>(planPath)) {}
+ExecutionPlan::ExecutionPlan(const std::string name, const std::string planPath)
+    : impl_(std::make_shared<Impl>(name, planPath)) {}
 
 }  // namespace mscclpp
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index d775cd593..62bf33806 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -78,7 +78,7 @@ struct Executor::Impl {
   ~Impl() = default;
 
   ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize,
-                                         size_t recvBufferSize, const ExecutionPlan& plan, cudaStream_t stream) {
+                                         size_t recvBufferSize, const ExecutionPlan& plan) {
     ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name};
     if (this->contexts.find(key) != this->contexts.end()) {
       return this->contexts[key];
@@ -96,10 +96,8 @@ struct Executor::Impl {
     this->setupDeviceExecutionPlan(context, rank, plan);
     context.deviceExecutionPlansBuffer =
         allocExtSharedCuda<char>(context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan));
-    MSCCLPP_CUDATHROW(cudaMemcpyAsync(context.deviceExecutionPlansBuffer.get(), context.deviceExecutionPlans.data(),
-                                      context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan),
-                                      cudaMemcpyHostToDevice, stream));
-    MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+    memcpyCuda(context.deviceExecutionPlansBuffer.get(), (char*)context.deviceExecutionPlans.data(),
+               context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice);
     this->contexts.insert({key, context});
     return context;
   }
@@ -286,7 +284,7 @@ void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuff
                        DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream,
                        PacketType packetType) {
   ExecutionContext context =
-      this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream);
+      this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
   this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream, packetType);
 }
 
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 8c0029f0a..450bb4f55 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -46,7 +46,7 @@ struct ChannelInfo {
 
 struct ExecutionPlan::Impl {
  public:
-  Impl(std::string planPath);
+  Impl(const std::string name, const std::string planPath);
   ~Impl() = default;
 
   std::vector<ChannelInfo> getChannelInfos(int rank, ChannelType channelType) const;
@@ -61,7 +61,8 @@ struct ExecutionPlan::Impl {
   void setupChannels(const nlohmann::json& gpus);
   void setupOperations(const nlohmann::json& gpus);
 
-  std::string planPath;
+  const std::string name;
+  const std::string planPath;
   bool isUsingPacket;
   // operations for [rank][threadblock] = [operations]
   std::unordered_map<int, std::vector<std::vector<Operation>>> operations;
@@ -69,7 +70,6 @@ struct ExecutionPlan::Impl {
   // threadblockChannelMap[rank][threadblock] = [channelIndex]
   std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockSMChannelMap;
   std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockProxyChannelMap;
-  std::string name;
   std::unordered_map<int, uint32_t> inputChunks;
   std::unordered_map<int, uint32_t> outputChunks;
   std::unordered_map<int, uint32_t> scratchChunks;
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 4a8822a99..213b3cdb6 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -34,7 +34,7 @@ int main() {
   CUDACHECK(cudaSetDevice(rank));
 
   std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
-  mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce_packet.json");
+  mscclpp::ExecutionPlan plan("allreduce_pairs", MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   const int bufferSize = 1024 * 1024;
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
   mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);

From 5f37c0aa94d86021af3ea311ad3c385c77c681f8 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 8 Apr 2024 03:31:44 +0000
Subject: [PATCH 38/51] lint

---
 python/test/executor_test.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 3cef5a318..395d089b8 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -11,6 +11,7 @@
 
 MSCCLPP_ROOT_PATH = "/root/mscclpp"
 
+
 def bench_time(niters: int, func):
     # capture cuda graph for niters of the kernel launch
     stream = cp.cuda.Stream(non_blocking=True)
@@ -34,6 +35,7 @@ def bench_time(niters: int, func):
 
     return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0
 
+
 if __name__ == "__main__":
     shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL)
     N_GPUS_PER_NODE = shm_comm.size
@@ -42,7 +44,9 @@ def bench_time(niters: int, func):
     cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use()
     mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
     executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
-    execution_plan = ExecutionPlan("allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json"))
+    execution_plan = ExecutionPlan(
+        "allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")
+    )
 
     nelems = 1024 * 1024
     cp.random.seed(42)
@@ -69,17 +73,20 @@ def bench_time(niters: int, func):
     stream.synchronize()
     assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size)
 
-    execution_time = bench_time(1000, lambda stream: executor.execute(
-        MPI.COMM_WORLD.rank,
-        sendbuf.data.ptr,
-        sendbuf.data.ptr,
-        sendbuf.nbytes,
-        sendbuf.nbytes,
-        DataType.float16,
-        512,
-        execution_plan,
-        stream.ptr,
-    ))
+    execution_time = bench_time(
+        1000,
+        lambda stream: executor.execute(
+            MPI.COMM_WORLD.rank,
+            sendbuf.data.ptr,
+            sendbuf.data.ptr,
+            sendbuf.nbytes,
+            sendbuf.nbytes,
+            DataType.float16,
+            512,
+            execution_plan,
+            stream.ptr,
+        ),
+    )
     print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
     executor = None
     mscclpp_group = None

From 78c56650cef1d67077589c4305b4aca48709c787 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 8 Apr 2024 03:45:53 +0000
Subject: [PATCH 39/51] fix

---
 src/include/execution_kernel.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index a026905aa..ae0c40e26 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -223,7 +223,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy
                                                   uint32_t* outputOffsets, int nDstChannels, size_t size,
                                                   uint32_t flag) {
   size_t nPackets = size * 2 / sizeof(PacketType);
-  const uint32_t srcOffset = dstOffsetByBytes / sizeof(PacketValType<PacketType>);
+  const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketValType<PacketType>);
   const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType<PacketType>);
   PacketValType<PacketType>* srcPacketValue = (PacketValType<PacketType>*)src + srcOffset;
   PacketValType<PacketType>* dstPacketValue = (PacketValType<PacketType>*)dst + dstOffset;

From 309f8f2db636c6920b40aeeaa94e8e3b6b45ef9a Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 9 Apr 2024 07:15:47 +0000
Subject: [PATCH 40/51] add test

---
 python/test/executor_test.py | 20 +-----------------
 python/test/test_mscclpp.py  | 41 +++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 395d089b8..c4cd0a87c 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -53,25 +53,7 @@ def bench_time(niters: int, func):
     buffer = cp.random.random(nelems).astype(cp.float16)
     sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size)
     sendbuf = sub_arrays[MPI.COMM_WORLD.rank]
-
-    expected = cp.zeros_like(sendbuf)
-    for i in range(MPI.COMM_WORLD.size):
-        expected += sub_arrays[i]
-
-    stream = cp.cuda.Stream(non_blocking=True)
-    executor.execute(
-        MPI.COMM_WORLD.rank,
-        sendbuf.data.ptr,
-        sendbuf.data.ptr,
-        sendbuf.nbytes,
-        sendbuf.nbytes,
-        DataType.float16,
-        512,
-        execution_plan,
-        stream.ptr,
-    )
-    stream.synchronize()
-    assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size)
+    mscclpp_group.barrier()
 
     execution_time = bench_time(
         1000,
diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py
index 4b3cb6ebf..f007718e3 100644
--- a/python/test/test_mscclpp.py
+++ b/python/test/test_mscclpp.py
@@ -12,7 +12,10 @@
 import pytest
 
 from mscclpp import (
+    DataType,
     EndpointConfig,
+    ExecutionPlan,
+    Executor,
     Fifo,
     Host2DeviceSemaphore,
     Host2HostSemaphore,
@@ -25,7 +28,7 @@
 import mscclpp.comm as mscclpp_comm
 from mscclpp.utils import KernelBuilder, pack
 from ._cpp import _ext
-from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
+from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group, N_GPUS_PER_NODE
 
 ethernet_interface_name = "eth0"
 
@@ -590,3 +593,39 @@ def test_nvls(mpi_group: MpiGroup):
     kernel()
     cp.cuda.runtime.deviceSynchronize()
     group.barrier()
+
+
+@parametrize_mpi_groups(2)
+@pytest.mark.parametrize("filename", ["allreduce.json", "allreduce_packet.json"])
+def test_executor(mpi_group: MpiGroup, filename: str):
+    if all_ranks_on_the_same_node(mpi_group) is False:
+        pytest.skip("algo not support cross node")
+    project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
+    executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
+    execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename))
+
+    nelems = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(nelems).astype(cp.float16)
+    sub_arrays = cp.split(buffer, mpi_group.comm.size)
+    sendbuf = sub_arrays[mpi_group.comm.rank]
+    expected = cp.zeros_like(sendbuf)
+    for i in range(mpi_group.comm.size):
+        expected += sub_arrays[i]
+    mscclpp_group.barrier()
+
+    stream = cp.cuda.Stream(non_blocking=True)
+    executor.execute(
+        mpi_group.comm.rank,
+        sendbuf.data.ptr,
+        sendbuf.data.ptr,
+        sendbuf.nbytes,
+        sendbuf.nbytes,
+        DataType.float16,
+        512,
+        execution_plan,
+        stream.ptr,
+    )
+    stream.synchronize()
+    assert cp.allclose(sendbuf, expected, atol=1e-3 * mpi_group.comm.size)

From 80a513e75ad9dfd61888dcbed2103160fb7c9842 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 9 Apr 2024 08:03:39 +0000
Subject: [PATCH 41/51] minor update

---
 python/test/executor_test.py   |  3 +++
 src/executor/execution_plan.cc | 11 +++++------
 src/executor/executor.cc       |  3 ++-
 src/include/execution_plan.hpp |  4 ++--
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index c4cd0a87c..50b296b2d 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
 from os import path
 from mscclpp import (
     DataType,
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index fb3f3a027..5948d029d 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -119,7 +119,7 @@ std::vector<Operation> ExecutionPlan::Impl::getOperations(int rank, int threadbl
 
 int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->operations.at(rank).size(); }
 
-void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) {
+void ExecutionPlan::Impl::loadExecutionPlan(int rank, size_t inputSize) {
   std::ifstream file(this->planPath);
   json obj = json::parse(file);
   if (this->name != obj["name"]) {
@@ -139,14 +139,12 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) {
   }
   this->setupChannels(gpus);
 
-  uint32_t maxInputChunks = 0;
-  for (const auto& [rank, chunks] : this->inputChunks) {
-    maxInputChunks = std::max(maxInputChunks, chunks);
-  }
-  this->chunkSize = inputSize / maxInputChunks;
+  this->chunkSize = inputSize / this->inputChunks[rank];
   this->setupOperations(gpus);
 }
 
+// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors.
+// Step 2. For each threadblock, construct a vector of channel indexes and keys.
 void ExecutionPlan::Impl::setupChannels(const json& gpus) {
   for (const auto& gpu : gpus) {
     int rank = gpu["id"];
@@ -227,6 +225,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
           for (int i = 0; i < operation.nInputs; i++) {
             BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
             BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
+            // Get the relevant channel index in rank channelInfos
             operation.inputChannelIndexes[i] =
                 channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]];
             operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"];
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 62bf33806..d4112f99b 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -83,7 +83,7 @@ struct Executor::Impl {
     if (this->contexts.find(key) != this->contexts.end()) {
       return this->contexts[key];
     }
-    plan.impl_->loadExecutionPlan(sendBufferSize);
+    plan.impl_->loadExecutionPlan(rank, sendBufferSize);
 
     ExecutionContext context;
     size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize);
@@ -285,6 +285,7 @@ void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuff
                        PacketType packetType) {
   ExecutionContext context =
       this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan);
+  // TODO(binyli): need to flush proxy channel here this->impl_->proxyService->startProxy();
   this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream, packetType);
 }
 
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
index 450bb4f55..924d1358c 100644
--- a/src/include/execution_plan.hpp
+++ b/src/include/execution_plan.hpp
@@ -57,7 +57,7 @@ struct ExecutionPlan::Impl {
   std::vector<Operation> getOperations(int rank, int threadblock) const;
   int getThreadblockCount(int rank) const;
 
-  void loadExecutionPlan(size_t inputSize);
+  void loadExecutionPlan(int rank, size_t inputSize);
   void setupChannels(const nlohmann::json& gpus);
   void setupOperations(const nlohmann::json& gpus);
 
@@ -67,7 +67,7 @@ struct ExecutionPlan::Impl {
   // operations for [rank][threadblock] = [operations]
   std::unordered_map<int, std::vector<std::vector<Operation>>> operations;
   std::unordered_map<int, std::vector<ChannelInfo>> channelInfos;
-  // threadblockChannelMap[rank][threadblock] = [channelIndex]
+  // threadblockChannelMap[rank][threadblock] = [channelIndex, channelKey]
   std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockSMChannelMap;
   std::unordered_map<int, std::vector<std::vector<std::pair<int, ChannelKey>>>> threadblockProxyChannelMap;
   std::unordered_map<int, uint32_t> inputChunks;

From ab8d6d74f56d539479b16ea46f75ffbff1afeaf8 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 17 Apr 2024 05:57:23 +0000
Subject: [PATCH 42/51] minor improve

---
 src/include/execution_kernel.hpp | 62 +++++++++++++++-----------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index ae0c40e26..5cf63af83 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -272,63 +272,59 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
 #else   // !defined(MSCCLPP_DEVICE_HIP)
   __syncthreads();
 #endif  // !defined(MSCCLPP_DEVICE_HIP)
+  localPlan = (DeviceExecutionPlan*)sharedMem;
+  int nOperations = localPlan->nOperations;
   Operation* operations = localPlan->operations;
   DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
   DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
   T* src = nullptr;
   T* dst = nullptr;
   T* tmp = nullptr;
-  for (int i = 0; i < localPlan->nOperations; i++) {
-    switch (operations[i].type) {
+  for (int i = 0; i < nOperations; i++) {
+    Operation* op = &operations[i];
+    switch (op->type) {
       case OperationType::BARRIER:
         __syncthreads();
         break;
       case OperationType::SIGNAL:
-        handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputs,
-                     operations[i].channelType);
+        handleSignal(tid, smChannels, proxyChannels, op->outputChannelIndexes, op->nOutputs, op->channelType);
         break;
       case OperationType::WAIT:
-        handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputs,
-                   operations[i].channelType);
+        handleWait(tid, smChannels, proxyChannels, op->inputChannelIndexes, op->nInputs, op->channelType);
         break;
       case OperationType::GET:
-        handleGet(smChannels[operations[i].inputChannelIndexes[0]], operations[i].inputOffsets[0],
-                  operations[i].dstOffset, operations[i].size);
+        handleGet(smChannels[op->inputChannelIndexes[0]], op->inputOffsets[0], op->dstOffset, op->size);
         break;
       case OperationType::READ_REDUCE_COPY_SEND:
-        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
-        handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
-                                 operations[i].outputChannelIndexes, operations[i].inputChannelIndexes,
-                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs,
-                                 operations[i].nInputs, operations[i].size);
+        dst = getBuffer(input, output, scratch, op->dstBufferType);
+        src = getBuffer(input, output, scratch, op->srcBufferType);
+        handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes,
+                                 op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs,
+                                 op->nInputs, op->size);
         break;
       case OperationType::READ_REDUCE_COPY:
-        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
-        handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels,
-                                 operations[i].outputChannelIndexes, operations[i].inputChannelIndexes,
-                                 operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs,
-                                 operations[i].nInputs, operations[i].size, false);
+        dst = getBuffer(input, output, scratch, op->dstBufferType);
+        src = getBuffer(input, output, scratch, op->srcBufferType);
+        handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes,
+                                 op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs,
+                                 op->nInputs, op->size, false);
         break;
       case OperationType::PUT_PACKET:
-        handlePutPacket<PacketType>(operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes,
-                                    operations[i].outputOffsets, operations[i].nOutputs, operations[i].size, flag);
+        handlePutPacket<PacketType>(op->srcOffset, smChannels, op->outputChannelIndexes, op->outputOffsets,
+                                    op->nOutputs, op->size, flag);
         break;
       case OperationType::REDUCE_SEND_PACKET:
-        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
-        tmp = getBuffer(input, output, scratch, operations[i].inputBufferType);
-        handleReduceSendPacket<T, PacketType>(dst, operations[i].dstOffset, src, operations[i].srcOffset, tmp,
-                                              operations[i].inputOffsets, operations[i].nInputs, smChannels,
-                                              operations[i].outputChannelIndexes, operations[i].outputOffsets,
-                                              operations[i].nOutputs, operations[i].size, flag);
+        dst = getBuffer(input, output, scratch, op->dstBufferType);
+        src = getBuffer(input, output, scratch, op->srcBufferType);
+        tmp = getBuffer(input, output, scratch, op->inputBufferType);
+        handleReduceSendPacket<T, PacketType>(dst, op->dstOffset, src, op->srcOffset, tmp, op->inputOffsets,
+                                              op->nInputs, smChannels, op->outputChannelIndexes, op->outputOffsets,
+                                              op->nOutputs, op->size, flag);
         break;
       case OperationType::COPY_PACKET:
-        dst = getBuffer(input, output, scratch, operations[i].dstBufferType);
-        src = getBuffer(input, output, scratch, operations[i].srcBufferType);
-        handleCopyPacket<PacketType>(dst, src, operations[i].dstOffset, operations[i].srcOffset, operations[i].size,
-                                     flag);
+        dst = getBuffer(input, output, scratch, op->dstBufferType);
+        src = getBuffer(input, output, scratch, op->srcBufferType);
+        handleCopyPacket<PacketType>(dst, src, op->dstOffset, op->srcOffset, op->size, flag);
         break;
       default:
         break;

From 48d877c9b1c4a4e80516c310228153b53154a6c4 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 17 Apr 2024 08:38:23 +0000
Subject: [PATCH 43/51] WIP: align DeviceExecutionPlan to 16 bytes and copy it to shared memory as int4

---
 src/include/execution_common.hpp | 4 ++--
 src/include/execution_kernel.hpp | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp
index 685317268..38d624109 100644
--- a/src/include/execution_common.hpp
+++ b/src/include/execution_common.hpp
@@ -67,8 +67,8 @@ struct Operation {
   uint32_t size;
 };
 
-// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes
-struct DeviceExecutionPlan {
+// total size = 1920 + 6400 + 4 + 4(padding) + 12(align) = 8336 bytes
+struct __attribute__((aligned(16))) DeviceExecutionPlan {
   uint8_t nSmChannels;                  // 1 bytes
   uint8_t nProxyChannels;               // 1 bytes
   uint16_t nOperations;                 // 2 bytes
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 5cf63af83..9fadb2bc1 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -260,12 +260,12 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOf
 template <typename T, typename PacketType = LL16Packet>
 __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
                                 DeviceExecutionPlan* plan, uint32_t flag) {
-  extern __shared__ int sharedMem[];
+  extern __shared__ int4 sharedMem[];
   int bid = blockIdx.x;
   int tid = threadIdx.x;
   DeviceExecutionPlan* localPlan = plan + bid;
-  for (size_t i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) {
-    sharedMem[i] = ((int*)localPlan)[i];
+  for (size_t i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int4); i += blockDim.x) {
+    sharedMem[i] = ((int4*)localPlan)[i];
   }
 #if defined(MSCCLPP_DEVICE_HIP)
   __synclds();

From d6b03669070e0244280789ba9033831db0ce997d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 17 Apr 2024 14:53:35 +0000
Subject: [PATCH 44/51] packet fix: double-buffer the scratch buffer and pick the half by flag parity

---
 src/executor/execution_kernel.cu | 26 ++++++++-------
 src/executor/execution_plan.cc   |  3 +-
 src/executor/executor.cc         | 10 +++---
 src/include/execution_kernel.hpp | 55 ++++++++++++++++----------------
 4 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
index 7aca5b1ed..4e96af9ab 100644
--- a/src/executor/execution_kernel.cu
+++ b/src/executor/execution_kernel.cu
@@ -8,33 +8,35 @@ namespace mscclpp {
 
 template <typename PacketType>
 void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                   cudaStream_t stream, uint32_t flag) {
+                                   size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan,
+                                   size_t sharedMemSize, cudaStream_t stream, uint32_t flag) {
   switch (dataType) {
     case DataType::INT32:
       executionKernel<int32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag);
+          rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, scratchSize, plan, flag);
       break;
     case DataType::UINT32:
       executionKernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag);
+          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, scratchSize, plan, flag);
       break;
     case DataType::FLOAT16:
-      executionKernel<half><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (half*)src, (half*)dst,
-                                                                                (half*)scratch, plan, flag);
+      executionKernel<half><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, (half*)src, (half*)dst, (half*)scratch, scratchSize, plan, flag);
       break;
     case DataType::FLOAT32:
-      executionKernel<float><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(rank, (float*)src, (float*)dst,
-                                                                                 (float*)scratch, plan, flag);
+      executionKernel<float><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, (float*)src, (float*)dst, (float*)scratch, scratchSize, plan, flag);
       break;
   }
 }
 
 template void ExecutionKernel::launchKernel<LL16Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
-                                                        void* scratch, DataType dataType, DeviceExecutionPlan* plan,
-                                                        size_t sharedMemSize, cudaStream_t stream, uint32_t flag);
+                                                        void* scratch, size_t scratchSize, DataType dataType,
+                                                        DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                                        cudaStream_t stream, uint32_t flag);
 template void ExecutionKernel::launchKernel<LL8Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
-                                                       void* scratch, DataType dataType, DeviceExecutionPlan* plan,
-                                                       size_t sharedMemSize, cudaStream_t stream, uint32_t flag);
+                                                       void* scratch, size_t scratchSize, DataType dataType,
+                                                       DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                                       cudaStream_t stream, uint32_t flag);
 }  // namespace mscclpp
 #endif
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 5948d029d..341c9bd9f 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -109,7 +109,8 @@ std::vector<BufferType> ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c
 }
 size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const {
   if (this->isUsingPacket) {
-    return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank) * 2;
+    return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank) * 2 /* data + flag*/ *
+           2 /*double buffer*/;
   }
   return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank);
 }
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index d4112f99b..5b1a827c6 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -263,13 +263,15 @@ struct Executor::Impl {
     switch (packetType) {
       case PacketType::LL16:
         ExecutionKernel::launchKernel<LL16Packet>(
-            rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType,
-            (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag);
+            rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(),
+            context.scratchBufferSize, dataType, (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(),
+            sharedMemSize, stream, ++flag);
         break;
       case PacketType::LL8:
         ExecutionKernel::launchKernel<LL8Packet>(
-            rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType,
-            (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag);
+            rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(),
+            context.scratchBufferSize, dataType, (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(),
+            sharedMemSize, stream, ++flag);
         break;
       default:
         throw std::runtime_error("Invalid packet type");
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 9fadb2bc1..4cfd1698a 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -207,22 +207,24 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOf
 }
 
 template <typename PacketType>
-MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHandle<SmChannel>* smChannels,
-                                           uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels,
-                                           uint32_t size, uint32_t flag) {
+MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, size_t scratchSize,
+                                           DeviceHandle<SmChannel>* smChannels, uint8_t* dstChannelIndexes,
+                                           uint32_t* dstOffsets, int nDstChannels, uint32_t size, uint32_t flag) {
+  const size_t scratchBaseOffset = flag & 0x1 ? 0 : scratchSize >> 1;
   for (int index = 0; index < nDstChannels; ++index) {
-    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(dstOffsets[index] * 2, inputOffsetByBytes, size,
-                                                                threadIdx.x, blockDim.x, flag);
+    smChannels[dstChannelIndexes[index]].putPackets<PacketType>(
+        scratchBaseOffset + dstOffsets[index] * 2, inputOffsetByBytes, size, threadIdx.x, blockDim.x, flag);
   }
 }
 
 template <typename T, typename PacketType>
 MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes,
-                                                  T* inputBuff, uint32_t* inputOffsets, int nSrcs,
+                                                  T* inputBuff, size_t inputBuffSize, uint32_t* inputOffsets, int nSrcs,
                                                   DeviceHandle<SmChannel>* smChannels, uint8_t* outputChannelIndexes,
                                                   uint32_t* outputOffsets, int nDstChannels, size_t size,
                                                   uint32_t flag) {
   size_t nPackets = size * 2 / sizeof(PacketType);
+  const size_t intputBaseOffset = flag & 0x1 ? 0 : inputBuffSize >> 1;
   const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketValType<PacketType>);
   const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType<PacketType>);
   PacketValType<PacketType>* srcPacketValue = (PacketValType<PacketType>*)src + srcOffset;
@@ -230,7 +232,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy
   for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
     PacketValType<PacketType> data = {};
     for (int index = 0; index < nSrcs; ++index) {
-      PacketType* pkt = (PacketType*)((char*)inputBuff + 2 * inputOffsets[index]);
+      PacketType* pkt = (PacketType*)((char*)inputBuff + intputBaseOffset + 2 * inputOffsets[index]);
       PacketValType<PacketType> val = pkt[idx].read(flag);
       data = add_vectors<T>(data, val);
     }
@@ -239,16 +241,17 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy
 
     PacketType pkt(data, flag);
     for (int index = 0; index < nDstChannels; ++index) {
-      size_t offset = (outputOffsets[index] * 2) / sizeof(PacketType);
+      size_t offset = (intputBaseOffset + outputOffsets[index] * 2) / sizeof(PacketType);
       smChannels[outputChannelIndexes[index]].write(offset + idx, pkt);
     }
   }
 }
 
 template <typename PacketType>
-MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size,
-                                            uint32_t flag) {
-  PacketType* srcPackets = (PacketType*)((char*)src + 2 * srcOffset);
+MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, size_t srcSize, uint32_t dstOffset,
+                                            uint32_t srcOffset, size_t size, uint32_t flag) {
+  const size_t outputScratchBaseOffset = flag & 0x1 ? 0 : srcSize >> 1;
+  PacketType* srcPackets = (PacketType*)((char*)src + outputScratchBaseOffset + 2 * srcOffset);
   PacketValType<PacketType>* result = (PacketValType<PacketType>*)((char*)dst + dstOffset);
   size_t nPackets = size * 2 / sizeof(PacketType);
   for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
@@ -259,7 +262,7 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOf
 
 template <typename T, typename PacketType = LL16Packet>
 __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
-                                DeviceExecutionPlan* plan, uint32_t flag) {
+                                size_t scratchSize, DeviceExecutionPlan* plan, uint32_t flag) {
   extern __shared__ int4 sharedMem[];
   int bid = blockIdx.x;
   int tid = threadIdx.x;
@@ -279,7 +282,6 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
   DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
   T* src = nullptr;
   T* dst = nullptr;
-  T* tmp = nullptr;
   for (int i = 0; i < nOperations; i++) {
     Operation* op = &operations[i];
     switch (op->type) {
@@ -310,21 +312,20 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
                                  op->nInputs, op->size, false);
         break;
       case OperationType::PUT_PACKET:
-        handlePutPacket<PacketType>(op->srcOffset, smChannels, op->outputChannelIndexes, op->outputOffsets,
+        handlePutPacket<PacketType>(op->srcOffset, scratchSize, smChannels, op->outputChannelIndexes, op->outputOffsets,
                                     op->nOutputs, op->size, flag);
         break;
       case OperationType::REDUCE_SEND_PACKET:
         dst = getBuffer(input, output, scratch, op->dstBufferType);
         src = getBuffer(input, output, scratch, op->srcBufferType);
-        tmp = getBuffer(input, output, scratch, op->inputBufferType);
-        handleReduceSendPacket<T, PacketType>(dst, op->dstOffset, src, op->srcOffset, tmp, op->inputOffsets,
-                                              op->nInputs, smChannels, op->outputChannelIndexes, op->outputOffsets,
-                                              op->nOutputs, op->size, flag);
+        handleReduceSendPacket<T, PacketType>(dst, op->dstOffset, src, op->srcOffset, scratch, scratchSize,
+                                              op->inputOffsets, op->nInputs, smChannels, op->outputChannelIndexes,
+                                              op->outputOffsets, op->nOutputs, op->size, flag);
         break;
       case OperationType::COPY_PACKET:
         dst = getBuffer(input, output, scratch, op->dstBufferType);
         src = getBuffer(input, output, scratch, op->srcBufferType);
-        handleCopyPacket<PacketType>(dst, src, op->dstOffset, op->srcOffset, op->size, flag);
+        handleCopyPacket<PacketType>(dst, src, scratchSize, op->dstOffset, op->srcOffset, op->size, flag);
         break;
       default:
         break;
@@ -338,32 +339,32 @@ class ExecutionKernel {
 #if defined(MSCCLPP_DEVICE_HIP)
   template <typename PacketType>
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream,
-                           uint32_t flag = 0) {
+                           size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                           cudaStream_t stream, uint32_t flag = 0) {
     switch (dataType) {
       case DataType::INT32:
         executionKernel<int32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-            rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag);
+            rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, scratchSize, plan, flag);
         break;
       case DataType::UINT32:
         executionKernel<uint32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-            rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag);
+            rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, scratchSize, plan, flag);
         break;
       case DataType::FLOAT16:
         executionKernel<half, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-            rank, (half*)src, (half*)dst, (half*)scratch, plan, flag);
+            rank, (half*)src, (half*)dst, (half*)scratch, scratchSize, plan, flag);
         break;
       case DataType::FLOAT32:
         executionKernel<float, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-            rank, (float*)src, (float*)dst, (float*)scratch, plan, flag);
+            rank, (float*)src, (float*)dst, (float*)scratch, scratchSize, plan, flag);
         break;
     }
   }
 #else   // !defined(MSCCLPP_DEVICE_HIP)
   template <typename PacketType>
   static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                           DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream,
-                           uint32_t flag = 0);
+                           size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                           cudaStream_t stream, uint32_t flag = 0);
 #endif  // !defined(MSCCLPP_DEVICE_HIP)
 };
 }  // namespace mscclpp

From 0c2b2c14f4b82118b06cf79c8410e40223a19c54 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 17 Apr 2024 19:07:25 +0000
Subject: [PATCH 45/51] Minor updates

---
 include/mscclpp/executor.hpp     |  2 +-
 src/executor/execution_plan.cc   |  2 +-
 src/include/execution_kernel.hpp | 53 ++++++++++++++++----------------
 3 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index f54c80585..ab54c2596 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -24,7 +24,7 @@ enum class PacketType {
 
 class ExecutionPlan {
  public:
-  ExecutionPlan(const std::string name, const std::string planPath);
+  ExecutionPlan(const std::string& name, const std::string& planPath);
   ~ExecutionPlan() = default;
 
  private:
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 341c9bd9f..9decbed1b 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -130,7 +130,7 @@ void ExecutionPlan::Impl::loadExecutionPlan(int rank, size_t inputSize) {
   if (protocol == "LL") {
     this->isUsingPacket = true;
   }
-  auto gpus = obj["gpus"];
+  const auto& gpus = obj["gpus"];
 
   for (const auto& gpu : gpus) {
     int rank = gpu["id"];
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 4cfd1698a..3e8d05f50 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -280,52 +280,51 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
   Operation* operations = localPlan->operations;
   DeviceHandle<SmChannel>* smChannels = localPlan->channels.smChannels;
   DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
-  T* src = nullptr;
-  T* dst = nullptr;
+
   for (int i = 0; i < nOperations; i++) {
-    Operation* op = &operations[i];
-    switch (op->type) {
+    Operation& op = operations[i];
+    switch (op.type) {
       case OperationType::BARRIER:
         __syncthreads();
         break;
       case OperationType::SIGNAL:
-        handleSignal(tid, smChannels, proxyChannels, op->outputChannelIndexes, op->nOutputs, op->channelType);
+        handleSignal(tid, smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType);
         break;
       case OperationType::WAIT:
-        handleWait(tid, smChannels, proxyChannels, op->inputChannelIndexes, op->nInputs, op->channelType);
+        handleWait(tid, smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType);
         break;
       case OperationType::GET:
-        handleGet(smChannels[op->inputChannelIndexes[0]], op->inputOffsets[0], op->dstOffset, op->size);
+        handleGet(smChannels[op.inputChannelIndexes[0]], op.inputOffsets[0], op.dstOffset, op.size);
         break;
       case OperationType::READ_REDUCE_COPY_SEND:
-        dst = getBuffer(input, output, scratch, op->dstBufferType);
-        src = getBuffer(input, output, scratch, op->srcBufferType);
-        handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes,
-                                 op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs,
-                                 op->nInputs, op->size);
+        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+        T* src = getBuffer(input, output, scratch, op.srcBufferType);
+        handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes,
+                                 op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs,
+                                 op.nInputs, op.size);
         break;
       case OperationType::READ_REDUCE_COPY:
-        dst = getBuffer(input, output, scratch, op->dstBufferType);
-        src = getBuffer(input, output, scratch, op->srcBufferType);
-        handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes,
-                                 op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs,
-                                 op->nInputs, op->size, false);
+        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+        T* src = getBuffer(input, output, scratch, op.srcBufferType);
+        handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes,
+                                 op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs,
+                                 op.nInputs, op.size, false);
         break;
       case OperationType::PUT_PACKET:
-        handlePutPacket<PacketType>(op->srcOffset, scratchSize, smChannels, op->outputChannelIndexes, op->outputOffsets,
-                                    op->nOutputs, op->size, flag);
+        handlePutPacket<PacketType>(op.srcOffset, scratchSize, smChannels, op.outputChannelIndexes, op.outputOffsets,
+                                    op.nOutputs, op.size, flag);
         break;
       case OperationType::REDUCE_SEND_PACKET:
-        dst = getBuffer(input, output, scratch, op->dstBufferType);
-        src = getBuffer(input, output, scratch, op->srcBufferType);
-        handleReduceSendPacket<T, PacketType>(dst, op->dstOffset, src, op->srcOffset, scratch, scratchSize,
-                                              op->inputOffsets, op->nInputs, smChannels, op->outputChannelIndexes,
-                                              op->outputOffsets, op->nOutputs, op->size, flag);
+        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+        T* src = getBuffer(input, output, scratch, op.srcBufferType);
+        handleReduceSendPacket<T, PacketType>(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize,
+                                              op.inputOffsets, op.nInputs, smChannels, op.outputChannelIndexes,
+                                              op.outputOffsets, op.nOutputs, op.size, flag);
         break;
       case OperationType::COPY_PACKET:
-        dst = getBuffer(input, output, scratch, op->dstBufferType);
-        src = getBuffer(input, output, scratch, op->srcBufferType);
-        handleCopyPacket<PacketType>(dst, src, scratchSize, op->dstOffset, op->srcOffset, op->size, flag);
+        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+        T* src = getBuffer(input, output, scratch, op.srcBufferType);
+        handleCopyPacket<PacketType>(dst, src, scratchSize, op.dstOffset, op.srcOffset, op.size, flag);
         break;
       default:
         break;

From 8d1b644e05b57931246d6f57d7bf2c3273672d58 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 17 Apr 2024 20:16:15 +0000
Subject: [PATCH 46/51] minor updates

---
 src/executor/execution_plan.cc   |  2 +-
 src/include/execution_kernel.hpp | 78 ++++++++++++++------------------
 2 files changed, 34 insertions(+), 46 deletions(-)

diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 9decbed1b..60fb2b438 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -272,7 +272,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) {
   }
 }
 
-ExecutionPlan::ExecutionPlan(const std::string name, const std::string planPath)
+ExecutionPlan::ExecutionPlan(const std::string& name, const std::string& planPath)
     : impl_(std::make_shared<Impl>(name, planPath)) {}
 
 }  // namespace mscclpp
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 3e8d05f50..9749faf69 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -283,51 +283,39 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
 
   for (int i = 0; i < nOperations; i++) {
     Operation& op = operations[i];
-    switch (op.type) {
-      case OperationType::BARRIER:
-        __syncthreads();
-        break;
-      case OperationType::SIGNAL:
-        handleSignal(tid, smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType);
-        break;
-      case OperationType::WAIT:
-        handleWait(tid, smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType);
-        break;
-      case OperationType::GET:
-        handleGet(smChannels[op.inputChannelIndexes[0]], op.inputOffsets[0], op.dstOffset, op.size);
-        break;
-      case OperationType::READ_REDUCE_COPY_SEND:
-        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
-        T* src = getBuffer(input, output, scratch, op.srcBufferType);
-        handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes,
-                                 op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs,
-                                 op.nInputs, op.size);
-        break;
-      case OperationType::READ_REDUCE_COPY:
-        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
-        T* src = getBuffer(input, output, scratch, op.srcBufferType);
-        handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes,
-                                 op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs,
-                                 op.nInputs, op.size, false);
-        break;
-      case OperationType::PUT_PACKET:
-        handlePutPacket<PacketType>(op.srcOffset, scratchSize, smChannels, op.outputChannelIndexes, op.outputOffsets,
-                                    op.nOutputs, op.size, flag);
-        break;
-      case OperationType::REDUCE_SEND_PACKET:
-        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
-        T* src = getBuffer(input, output, scratch, op.srcBufferType);
-        handleReduceSendPacket<T, PacketType>(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize,
-                                              op.inputOffsets, op.nInputs, smChannels, op.outputChannelIndexes,
-                                              op.outputOffsets, op.nOutputs, op.size, flag);
-        break;
-      case OperationType::COPY_PACKET:
-        T* dst = getBuffer(input, output, scratch, op.dstBufferType);
-        T* src = getBuffer(input, output, scratch, op.srcBufferType);
-        handleCopyPacket<PacketType>(dst, src, scratchSize, op.dstOffset, op.srcOffset, op.size, flag);
-        break;
-      default:
-        break;
+    if (op.type == OperationType::BARRIER) {
+      __syncthreads();
+    } else if (op.type == OperationType::SIGNAL) {
+      handleSignal(tid, smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType);
+    } else if (op.type == OperationType::WAIT) {
+      handleWait(tid, smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType);
+    } else if (op.type == OperationType::GET) {
+      handleGet(smChannels[op.inputChannelIndexes[0]], op.inputOffsets[0], op.dstOffset, op.size);
+    } else if (op.type == OperationType::READ_REDUCE_COPY_SEND) {
+      T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+      T* src = getBuffer(input, output, scratch, op.srcBufferType);
+      handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes,
+                               op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs,
+                               op.size);
+    } else if (op.type == OperationType::READ_REDUCE_COPY) {
+      T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+      T* src = getBuffer(input, output, scratch, op.srcBufferType);
+      handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes,
+                               op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs,
+                               op.size, false);
+    } else if (op.type == OperationType::PUT_PACKET) {
+      handlePutPacket<PacketType>(op.srcOffset, scratchSize, smChannels, op.outputChannelIndexes, op.outputOffsets,
+                                  op.nOutputs, op.size, flag);
+    } else if (op.type == OperationType::REDUCE_SEND_PACKET) {
+      T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+      T* src = getBuffer(input, output, scratch, op.srcBufferType);
+      handleReduceSendPacket<T, PacketType>(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, op.inputOffsets,
+                                            op.nInputs, smChannels, op.outputChannelIndexes, op.outputOffsets,
+                                            op.nOutputs, op.size, flag);
+    } else if (op.type == OperationType::COPY_PACKET) {
+      T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+      T* src = getBuffer(input, output, scratch, op.srcBufferType);
+      handleCopyPacket<PacketType>(dst, src, scratchSize, op.dstOffset, op.srcOffset, op.size, flag);
     }
   }
 }

From c29df8eca260fe4326f8a3703c41c6dbd0174e36 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 18 Apr 2024 03:17:50 +0000
Subject: [PATCH 47/51] Fix hang

---
 python/test/executor_test.py               | 12 +++++++-----
 test/execution-files/allreduce_packet.json |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 50b296b2d..239ecd4a7 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -15,7 +15,7 @@
 MSCCLPP_ROOT_PATH = "/root/mscclpp"
 
 
-def bench_time(niters: int, func):
+def bench_time(niters: int, ngraphIters: int, func):
     # capture cuda graph for niters of the kernel launch
     stream = cp.cuda.Stream(non_blocking=True)
     with stream:
@@ -32,11 +32,12 @@ def bench_time(niters: int, func):
     end = cp.cuda.Event()
 
     start.record(stream)
-    graph.launch(stream)
+    for _ in range(ngraphIters):
+        graph.launch(stream)
     end.record(stream)
     end.synchronize()
 
-    return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0
+    return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 / ngraphIters
 
 
 if __name__ == "__main__":
@@ -59,7 +60,8 @@ def bench_time(niters: int, func):
     mscclpp_group.barrier()
 
     execution_time = bench_time(
-        1000,
+        100,
+        10,
         lambda stream: executor.execute(
             MPI.COMM_WORLD.rank,
             sendbuf.data.ptr,
@@ -72,6 +74,6 @@ def bench_time(niters: int, func):
             stream.ptr,
         ),
     )
-    print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
+    print(f"Rank: {MPI.COMM_WORLD.rank} Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
     executor = None
     mscclpp_group = None
diff --git a/test/execution-files/allreduce_packet.json b/test/execution-files/allreduce_packet.json
index 7045f21c2..c01ae4fd9 100644
--- a/test/execution-files/allreduce_packet.json
+++ b/test/execution-files/allreduce_packet.json
@@ -169,7 +169,7 @@
       "id": 1,
       "inputChunks": 4,
       "outputChunks": 0,
-      "scratchChunks": 6,
+      "scratchChunks": 8,
       "threadblocks": [
         {
           "id": 0,

From 4a739909c9c27959cb4b26fc8bf0a0db0419489f Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 18 Apr 2024 05:36:10 +0000
Subject: [PATCH 48/51] address comments

---
 include/mscclpp/core.hpp          |  2 +-
 include/mscclpp/errors.hpp        |  1 +
 include/mscclpp/packet_device.hpp |  4 ++--
 src/errors.cc                     |  4 ++++
 src/executor/execution_plan.cc    |  8 ++++----
 src/executor/executor.cc          |  6 +++---
 src/include/execution_kernel.hpp  | 20 ++++++++++----------
 7 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 456020975..132df587a 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -762,7 +762,7 @@ DeviceHandle<std::remove_reference_t<T>> deviceHandle(T&& t) {
 
 /// Packet value type.
 template <class T>
-using PacketValType = typename T::ValueType;
+using PacketPayload = typename T::Payload;
 
 }  // namespace mscclpp
 
diff --git a/include/mscclpp/errors.hpp b/include/mscclpp/errors.hpp
index 4e90c8d84..8d3fde4d1 100644
--- a/include/mscclpp/errors.hpp
+++ b/include/mscclpp/errors.hpp
@@ -16,6 +16,7 @@ enum class ErrorCode {
   InvalidUsage,   // The function was used incorrectly.
   Timeout,        // The operation timed out.
   Aborted,        // The operation was aborted.
+  ExecutorError,  // An error occurred in the MSCCL++ executor.
 };
 
 /// Convert an error code to a string.
diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp
index a20c8abec..8cff4c790 100644
--- a/include/mscclpp/packet_device.hpp
+++ b/include/mscclpp/packet_device.hpp
@@ -24,7 +24,7 @@ union alignas(16) LL16Packet {
     uint32_t data2;
     uint32_t flag2;
   };
-  using ValueType = uint2;
+  using Payload = uint2;
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
   ulonglong2 raw_;
@@ -104,7 +104,7 @@ union alignas(8) LL8Packet {
   };
   uint64_t raw_;
 
-  using ValueType = uint32_t;
+  using Payload = uint32_t;
 #if defined(MSCCLPP_DEVICE_COMPILE)
 
   MSCCLPP_DEVICE_INLINE LL8Packet() {}
diff --git a/src/errors.cc b/src/errors.cc
index 537b3fc27..fbc7a5734 100644
--- a/src/errors.cc
+++ b/src/errors.cc
@@ -19,6 +19,10 @@ std::string errorToString(enum ErrorCode error) {
       return "InvalidUsage";
     case ErrorCode::Timeout:
       return "Timeout";
+    case ErrorCode::Aborted:
+      return "Aborted";
+    case ErrorCode::ExecutorError:
+      return "ExecutorError";
     default:
       return "UnknownError";
   }
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 60fb2b438..da7e135a7 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -44,7 +44,7 @@ auto getOpType = [](const std::string& str) {
   } else if (str == "cpkt") {
     return mscclpp::OperationType::COPY_PACKET;
   } else {
-    throw std::runtime_error("Invalid operation type");
+    throw mscclpp::Error("Invalid operation type", mscclpp::ErrorCode::ExecutorError);
   }
 };
 
@@ -56,7 +56,7 @@ auto convertToBufferType = [](const std::string& str) {
   } else if (str == "s") {
     return mscclpp::BufferType::SCRATCH;
   } else {
-    throw std::runtime_error("Invalid buffer type");
+    throw mscclpp::Error("Invalid buffer type", mscclpp::ErrorCode::ExecutorError);
   }
 };
 
@@ -68,7 +68,7 @@ auto convertToChannelType = [](const std::string& str) {
   } else if (str == "none") {
     return mscclpp::ChannelType::NONE;
   } else {
-    throw std::runtime_error("Invalid channel type");
+    throw mscclpp::Error("Invalid channel type", mscclpp::ErrorCode::ExecutorError);
   }
 };
 
@@ -124,7 +124,7 @@ void ExecutionPlan::Impl::loadExecutionPlan(int rank, size_t inputSize) {
   std::ifstream file(this->planPath);
   json obj = json::parse(file);
   if (this->name != obj["name"]) {
-    throw std::runtime_error("Plan name does not match");
+    throw Error("Plan name does not match", ErrorCode::ExecutorError);
   }
   std::string protocol = obj["protocol"];
   if (protocol == "LL") {
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 5b1a827c6..fb033a73e 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -142,7 +142,7 @@ struct Executor::Impl {
         case BufferType::SCRATCH:
           return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize);
         default:
-          throw std::runtime_error("Invalid buffer type");
+          throw Error("Invalid buffer type", ErrorCode::ExecutorError);
       }
     };
     auto getConnectedPeers = [&](std::vector<ChannelInfo>& infos) {
@@ -206,7 +206,7 @@ struct Executor::Impl {
         case BufferType::SCRATCH:
           return (void*)context.scratchBuffer.get();
         default:
-          throw std::runtime_error("Invalid buffer type");
+          throw Error("Invalid buffer type", ErrorCode::ExecutorError);
       }
     };
     for (ChannelType channelType : channelTypes) {
@@ -274,7 +274,7 @@ struct Executor::Impl {
             sharedMemSize, stream, ++flag);
         break;
       default:
-        throw std::runtime_error("Invalid packet type");
+        throw Error("Invalid packet type", ErrorCode::ExecutorError);
     }
   }
 };
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 9749faf69..08e8796a5 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -225,19 +225,19 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy
                                                   uint32_t flag) {
   size_t nPackets = size * 2 / sizeof(PacketType);
   const size_t intputBaseOffset = flag & 0x1 ? 0 : inputBuffSize >> 1;
-  const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketValType<PacketType>);
-  const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType<PacketType>);
-  PacketValType<PacketType>* srcPacketValue = (PacketValType<PacketType>*)src + srcOffset;
-  PacketValType<PacketType>* dstPacketValue = (PacketValType<PacketType>*)dst + dstOffset;
+  const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketPayload<PacketType>);
+  const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketPayload<PacketType>);
+  PacketPayload<PacketType>* srcPacketPayload = (PacketPayload<PacketType>*)src + srcOffset;
+  PacketPayload<PacketType>* dstPacketPayload = (PacketPayload<PacketType>*)dst + dstOffset;
   for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
-    PacketValType<PacketType> data = {};
+    PacketPayload<PacketType> data = {};
     for (int index = 0; index < nSrcs; ++index) {
       PacketType* pkt = (PacketType*)((char*)inputBuff + intputBaseOffset + 2 * inputOffsets[index]);
-      PacketValType<PacketType> val = pkt[idx].read(flag);
+      PacketPayload<PacketType> val = pkt[idx].read(flag);
       data = add_vectors<T>(data, val);
     }
-    data = add_vectors<T>(data, srcPacketValue[idx]);
-    dstPacketValue[idx] = data;
+    data = add_vectors<T>(data, srcPacketPayload[idx]);
+    dstPacketPayload[idx] = data;
 
     PacketType pkt(data, flag);
     for (int index = 0; index < nDstChannels; ++index) {
@@ -252,10 +252,10 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, size_t srcSize
                                             uint32_t srcOffset, size_t size, uint32_t flag) {
   const size_t outputScratchBaseOffset = flag & 0x1 ? 0 : srcSize >> 1;
   PacketType* srcPackets = (PacketType*)((char*)src + outputScratchBaseOffset + 2 * srcOffset);
-  PacketValType<PacketType>* result = (PacketValType<PacketType>*)((char*)dst + dstOffset);
+  PacketPayload<PacketType>* result = (PacketPayload<PacketType>*)((char*)dst + dstOffset);
   size_t nPackets = size * 2 / sizeof(PacketType);
   for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) {
-    PacketValType<PacketType> data = srcPackets[idx].read(flag);
+    PacketPayload<PacketType> data = srcPackets[idx].read(flag);
     result[idx] = data;
   }
 }

From d4671bc6b77f012f6c37a59bfe139a99254e5a8d Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 18 Apr 2024 11:10:19 +0000
Subject: [PATCH 49/51] address comments

---
 include/mscclpp/core.hpp        |  4 ++++
 include/mscclpp/executor.hpp    |  2 +-
 python/mscclpp/comm.py          |  1 +
 python/mscclpp/core_py.cpp      |  1 +
 python/mscclpp/executor.cpp     |  2 +-
 python/test/executor_test.py    |  8 ++------
 python/test/test_mscclpp.py     |  4 ++--
 src/bootstrap/bootstrap.cc      | 25 +++++++++++++++++++++++++
 src/executor/executor.cc        |  6 +++---
 test/executor_test.cc           |  4 ++--
 test/mp_unit/bootstrap_tests.cc |  7 +++++++
 11 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 132df587a..01b8096a9 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -38,6 +38,7 @@ class Bootstrap {
   virtual ~Bootstrap() = default;
   virtual int getRank() = 0;
   virtual int getNranks() = 0;
+  virtual int getNranksPerNode() = 0;
   virtual void send(void* data, int size, int peer, int tag) = 0;
   virtual void recv(void* data, int size, int peer, int tag) = 0;
   virtual void allGather(void* allData, int size) = 0;
@@ -83,6 +84,9 @@ class TcpBootstrap : public Bootstrap {
   /// Return the total number of ranks.
   int getNranks() override;
 
+  /// Return the number of ranks that run on the same node as this rank.
+  int getNranksPerNode() override;
+
   /// Send data to another process.
   ///
   /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
index ab54c2596..23dc7cece 100644
--- a/include/mscclpp/executor.hpp
+++ b/include/mscclpp/executor.hpp
@@ -36,7 +36,7 @@ class ExecutionPlan {
 
 class Executor {
  public:
-  Executor(std::shared_ptr<Communicator> comm, int nranksPerNode);
+  Executor(std::shared_ptr<Communicator> comm);
   Executor(const Executor&) = delete;
   Executor& operator=(const Executor&) = delete;
   ~Executor();
diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py
index b3cc51f12..1cf9ebb41 100644
--- a/python/mscclpp/comm.py
+++ b/python/mscclpp/comm.py
@@ -51,6 +51,7 @@ def __init__(
         self.communicator = Communicator(self.bootstrap)
         self.my_rank = self.bootstrap.get_rank()
         self.nranks = self.bootstrap.get_n_ranks()
+        self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
 
     def barrier(self):
         self.bootstrap.barrier()
diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp
index 3f78dad35..68da91599 100644
--- a/python/mscclpp/core_py.cpp
+++ b/python/mscclpp/core_py.cpp
@@ -36,6 +36,7 @@ void register_core(nb::module_& m) {
   nb::class_<Bootstrap>(m, "Bootstrap")
       .def("get_rank", &Bootstrap::getRank)
       .def("get_n_ranks", &Bootstrap::getNranks)
+      .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
       .def(
           "send",
           [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) {
diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp
index 9b411ba8a..9f58eac8f 100644
--- a/python/mscclpp/executor.cpp
+++ b/python/mscclpp/executor.cpp
@@ -24,7 +24,7 @@ void register_executor(nb::module_& m) {
       .def(nb::init<const std::string, const std::string>(), nb::arg("name"), nb::arg("planPath"));
 
   nb::class_<Executor>(m, "Executor")
-      .def(nb::init<std::shared_ptr<Communicator>, int>(), nb::arg("comm"), nb::arg("nranksPerNode"))
+      .def(nb::init<std::shared_ptr<Communicator>>(), nb::arg("comm"))
       .def(
           "execute",
           [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize,
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 239ecd4a7..b0e4342dd 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -41,13 +41,9 @@ def bench_time(niters: int, ngraphIters: int, func):
 
 
 if __name__ == "__main__":
-    shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL)
-    N_GPUS_PER_NODE = shm_comm.size
-    shm_comm.Free()
-
-    cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use()
     mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
-    executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
+    cp.cuda.Device(MPI.COMM_WORLD.rank % mscclpp_group.nranks_per_node).use()
+    executor = Executor(mscclpp_group.communicator)
     execution_plan = ExecutionPlan(
         "allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")
     )
diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py
index f007718e3..c6014b84e 100644
--- a/python/test/test_mscclpp.py
+++ b/python/test/test_mscclpp.py
@@ -28,7 +28,7 @@
 import mscclpp.comm as mscclpp_comm
 from mscclpp.utils import KernelBuilder, pack
 from ._cpp import _ext
-from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group, N_GPUS_PER_NODE
+from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
 
 ethernet_interface_name = "eth0"
 
@@ -602,7 +602,7 @@ def test_executor(mpi_group: MpiGroup, filename: str):
         pytest.skip("algo not support cross node")
     project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
     mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
-    executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
+    executor = Executor(mscclpp_group.communicator)
     execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename))
 
     nelems = 1024 * 1024
diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc
index c9cea10f4..d6e9a0dfb 100644
--- a/src/bootstrap/bootstrap.cc
+++ b/src/bootstrap/bootstrap.cc
@@ -81,6 +81,7 @@ class TcpBootstrap::Impl {
   UniqueId getUniqueId() const;
   int getRank();
   int getNranks();
+  int getNranksPerNode();
   void allGather(void* allData, int size);
   void send(void* data, int size, int peer, int tag);
   void recv(void* data, int size, int peer, int tag);
@@ -91,6 +92,7 @@ class TcpBootstrap::Impl {
   UniqueIdInternal uniqueId_;
   int rank_;
   int nRanks_;
+  int nRanksPerNode_;
   bool netInitialized;
   std::unique_ptr<Socket> listenSockRoot_;
   std::unique_ptr<Socket> listenSock_;
@@ -141,6 +143,7 @@ UniqueId TcpBootstrap::Impl::getUniqueId(const UniqueIdInternal& uniqueId) {
 TcpBootstrap::Impl::Impl(int rank, int nRanks)
     : rank_(rank),
       nRanks_(nRanks),
+      nRanksPerNode_(0),
       netInitialized(false),
       peerCommAddresses_(nRanks, SocketAddress()),
       barrierArr_(nRanks, 0),
@@ -418,6 +421,26 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) {
   TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_);
 }
 
+int TcpBootstrap::Impl::getNranksPerNode() {  // counts ranks whose stored peer address equals this rank's address
+  if (nRanksPerNode_ > 0) return nRanksPerNode_;  // a positive value was cached by an earlier call
+  int nRanksPerNode = 0;
+  bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET;  // this rank's family picks the comparison
+  for (int i = 0; i < nRanks_; i++) {  // i == rank_ compares equal to itself, so the count includes this rank
+    if (useIpv4) {
+      if (peerCommAddresses_[i].sin.sin_addr.s_addr == peerCommAddresses_[rank_].sin.sin_addr.s_addr) {
+        nRanksPerNode++;
+      }
+    } else {  // every family other than AF_INET is compared through the sin6 address bytes
+      if (std::memcmp(&(peerCommAddresses_[i].sin6.sin6_addr), &(peerCommAddresses_[rank_].sin6.sin6_addr),
+                      sizeof(in6_addr)) == 0) {
+        nRanksPerNode++;
+      }
+    }
+  }
+  nRanksPerNode_ = nRanksPerNode;  // NOTE(review): presumably valid only after peer addresses are exchanged — confirm
+  return nRanksPerNode_;
+}
+
 void TcpBootstrap::Impl::allGather(void* allData, int size) {
   char* data = static_cast<char*>(allData);
   int rank = rank_;
@@ -520,6 +543,8 @@ MSCCLPP_API_CPP int TcpBootstrap::getRank() { return pimpl_->getRank(); }
 
 MSCCLPP_API_CPP int TcpBootstrap::getNranks() { return pimpl_->getNranks(); }
 
+MSCCLPP_API_CPP int TcpBootstrap::getNranksPerNode() { return pimpl_->getNranksPerNode(); }  // delegates to pimpl_
+
 MSCCLPP_API_CPP void TcpBootstrap::send(void* data, int size, int peer, int tag) {
   pimpl_->send(data, size, peer, tag);
 }
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index fb033a73e..2f4fdd264 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -72,7 +72,8 @@ struct Executor::Impl {
   std::shared_ptr<ProxyService> proxyService;
   std::unordered_map<ExecutionContextKey, ExecutionContext> contexts;
 
-  Impl(std::shared_ptr<Communicator> comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) {
+  Impl(std::shared_ptr<Communicator> comm) : comm(comm) {  // stores the shared communicator handle
+    this->nranksPerNode = comm->bootstrap()->getNranksPerNode();  // queried from the bootstrap, not passed in
     this->proxyService = std::make_shared<ProxyService>();
   }
   ~Impl() = default;
@@ -279,8 +280,7 @@ struct Executor::Impl {
   }
 };
 
-Executor::Executor(std::shared_ptr<Communicator> comm, int nranksPerNode)
-    : impl_(std::make_unique<Impl>(comm, nranksPerNode)) {}
+Executor::Executor(std::shared_ptr<Communicator> comm) : impl_(std::make_unique<Impl>(comm)) {}  // hands comm to Impl
 
 void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize,
                        DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream,
diff --git a/test/executor_test.cc b/test/executor_test.cc
index 213b3cdb6..fafa71412 100644
--- a/test/executor_test.cc
+++ b/test/executor_test.cc
@@ -27,13 +27,13 @@ int main() {
     id = bootstrap->createUniqueId();
   }
   MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
-  bootstrap->initialize(id);
   // sleep 20s
   // std::this_thread::sleep_for(std::chrono::seconds(20));
+  bootstrap->initialize(id);
   auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
   CUDACHECK(cudaSetDevice(rank));
 
-  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm, 8 /*nranksPerNode*/);
+  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm);
   mscclpp::ExecutionPlan plan("allreduce_pairs", MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
   const int bufferSize = 1024 * 1024;
   std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc
index 69e566dbd..65ec17027 100644
--- a/test/mp_unit/bootstrap_tests.cc
+++ b/test/mp_unit/bootstrap_tests.cc
@@ -120,6 +120,16 @@ class MPIBootstrap : public mscclpp::Bootstrap {
     MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
     return worldSize;
   }
+  // Returns the number of ranks in this rank's MPI_COMM_TYPE_SHARED split of MPI_COMM_WORLD.
+  // MPI_Comm_split_type is a collective call, so every rank of MPI_COMM_WORLD has to call this together.
+  int getNranksPerNode() override {
+    MPI_Comm shmComm;
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmComm);
+    int nRanksPerNode = 0;
+    MPI_Comm_size(shmComm, &nRanksPerNode);
+    MPI_Comm_free(&shmComm);  // the split communicator is only needed for the size query; do not leak it per call
+    return nRanksPerNode;
+  }
   void allGather(void* sendbuf, int size) override {
     MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD);
   }

From 149eb416f25de1c5fcf4848cd1efede6ab333c10 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Thu, 18 Apr 2024 13:53:53 +0000
Subject: [PATCH 50/51] address comments

---
 test/CMakeLists.txt            |  1 -
 test/executor_test.cc          | 47 ---------------------------
 test/mp_unit/CMakeLists.txt    |  1 +
 test/mp_unit/executor_tests.cc | 58 ++++++++++++++++++++++++++++++++++
 test/mp_unit/mp_unit_tests.hpp |  8 +++++
 5 files changed, 67 insertions(+), 48 deletions(-)
 delete mode 100644 test/executor_test.cc
 create mode 100644 test/mp_unit/executor_tests.cc

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 501f96ab0..da47066ea 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -24,7 +24,6 @@ endfunction()
 add_test_executable(allgather_test_cpp allgather_test_cpp.cu)
 add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu)
 add_test_executable(nvls_test nvls_test.cu)
-add_test_executable(executor_test executor_test.cc)
 
 configure_file(run_mpi_test.sh.in run_mpi_test.sh)
 
diff --git a/test/executor_test.cc b/test/executor_test.cc
deleted file mode 100644
index fafa71412..000000000
--- a/test/executor_test.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <mpi.h>
-
-#include <mscclpp/executor.hpp>
-#include <mscclpp/gpu_utils.hpp>
-
-// Check CUDA RT calls
-#define CUDACHECK(cmd)                                                                  \
-  do {                                                                                  \
-    cudaError_t err = cmd;                                                              \
-    if (err != cudaSuccess) {                                                           \
-      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-      exit(EXIT_FAILURE);                                                               \
-    }                                                                                   \
-  } while (false)
-
-const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp";
-
-int main() {
-  int rank;
-  int world_size;
-  MPI_Init(NULL, NULL);
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, world_size);
-  mscclpp::UniqueId id;
-  if (rank == 0) {
-    id = bootstrap->createUniqueId();
-  }
-  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
-  // sleep 20s
-  // std::this_thread::sleep_for(std::chrono::seconds(20));
-  bootstrap->initialize(id);
-  auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
-  CUDACHECK(cudaSetDevice(rank));
-
-  std::shared_ptr<mscclpp::Executor> executor = std::make_shared<mscclpp::Executor>(comm);
-  mscclpp::ExecutionPlan plan("allreduce_pairs", MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json");
-  const int bufferSize = 1024 * 1024;
-  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
-  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
-  executor->execute(rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, 512, plan,
-                    stream);
-  CUDACHECK(cudaStreamSynchronize(stream));
-
-  MPI_Finalize();
-  return 0;
-}
diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt
index dc388844f..8e37d2405 100644
--- a/test/mp_unit/CMakeLists.txt
+++ b/test/mp_unit/CMakeLists.txt
@@ -8,4 +8,5 @@ target_sources(mp_unit_tests PRIVATE
     communicator_tests.cu
     proxy_channel_tests.cu
     sm_channel_tests.cu
+    executor_tests.cc
 )
diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc
new file mode 100644
index 000000000..6bfe09516
--- /dev/null
+++ b/test/mp_unit/executor_tests.cc
@@ -0,0 +1,72 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <limits.h>
+#include <mpi.h>
+#include <unistd.h>
+
+#include <filesystem>
+#include <stdexcept>
+#include <string>
+
+#include "mp_unit_tests.hpp"
+
+namespace {
+// Resolves the /proc/self/exe symlink and returns its target as a string.
+// Throws std::runtime_error when readlink() reports failure.
+std::string getExecutablePath() {
+  char result[PATH_MAX];
+  ssize_t count = readlink("/proc/self/exe", result, PATH_MAX);
+  if (count == -1) {
+    throw std::runtime_error("Failed to get executable path");
+  }
+  // readlink() does not NUL-terminate the buffer, so build the string from the returned length.
+  return std::string(result, count);
+}
+}  // namespace
+
+// Sets the CUDA device from this rank's local rank, then builds a TcpBootstrap-backed Communicator and the Executor.
+void ExecutorTest::SetUp() {
+  MultiProcessTest::SetUp();
+
+  MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank)));
+  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap;
+  mscclpp::UniqueId id;
+  if (gEnv->rank < gEnv->worldSize) {
+    bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
+    if (gEnv->rank == 0) id = bootstrap->createUniqueId();
+  }
+  // Rank 0 created the unique id; broadcast its bytes so every rank initializes with the same id.
+  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
+  bootstrap->initialize(id);
+  std::shared_ptr<mscclpp::Communicator> communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
+  executor = std::make_shared<mscclpp::Executor>(communicator);
+}
+
+// Releases the Executor before the base fixture tears down.
+void ExecutorTest::TearDown() {
+  executor.reset();
+  MultiProcessTest::TearDown();
+}
+
+// Runs the "allreduce_pairs" plan from test/execution-files/allreduce.json once and waits for the stream to finish.
+// NOTE(review): the name says "TwoNodes" but the guard asks for 2 ranks with 2 ranks per node — confirm the intent.
+TEST_F(ExecutorTest, TwoNodesAllreduce) {
+  if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
+    // GTEST_SKIP() returns from the test body itself, so no explicit return is needed after it.
+    GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
+  }
+  std::string executablePath = getExecutablePath();
+  std::filesystem::path path = executablePath;
+  // Drop the executable's file name and two enclosing directories, then append the plan's relative path.
+  std::filesystem::path executionFilesPath =
+      path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json";
+  mscclpp::ExecutionPlan plan("allreduce_pairs", executionFilesPath.string());
+  const int bufferSize = 1024 * 1024;
+  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
+  mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  // The same device buffer is passed as both the send and the receive buffer.
+  executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, 512,
+                    plan, stream);
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+}
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index e934dee49..6cb159c67 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -7,6 +7,7 @@
 #include <gtest/gtest.h>
 
 #include <mscclpp/core.hpp>
+#include <mscclpp/executor.hpp>
 #include <mscclpp/packet_device.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
@@ -155,4 +156,11 @@ class SmChannelOneToOneTest : public CommunicatorTestBase {
   std::unordered_map<int, std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
 };
 
+class ExecutorTest : public MultiProcessTest {  // fixture that holds a shared mscclpp::Executor
+ protected:
+  void SetUp() override;
+  void TearDown() override;
+
+  std::shared_ptr<mscclpp::Executor> executor;  // created in SetUp() and reset in TearDown() (executor_tests.cc)
+};
 #endif  // MSCCLPP_MP_UNIT_TESTS_HPP_

From c407f29f9d420f81540ef0e11a6267530c886569 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 18 Apr 2024 18:16:29 +0000
Subject: [PATCH 51/51] Python binding for ExecutorError

---
 python/mscclpp/error_py.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/mscclpp/error_py.cpp b/python/mscclpp/error_py.cpp
index 18d4b834a..af78ac880 100644
--- a/python/mscclpp/error_py.cpp
+++ b/python/mscclpp/error_py.cpp
@@ -16,7 +16,8 @@ void register_error(nb::module_& m) {
       .value("RemoteError", ErrorCode::RemoteError)
       .value("InvalidUsage", ErrorCode::InvalidUsage)
       .value("Timeout", ErrorCode::Timeout)
-      .value("Aborted", ErrorCode::Aborted);
+      .value("Aborted", ErrorCode::Aborted)
+      .value("ExecutorError", ErrorCode::ExecutorError);
 
   nb::class_<BaseError>(m, "BaseError")
       .def(nb::init<std::string&, int>(), nb::arg("message"), nb::arg("errorCode"))