microsoft · chhwang · Apr 18, 2024 · Mar 28, 2024 · Mar 29, 2024 · Mar 29, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -99,13 +99,17 @@ find_package(IBVerbs REQUIRED)
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
 
+# Download nlohmann/json v3.11.3 at configure time; mscclpp_obj links nlohmann_json::nlohmann_json below.
+include(FetchContent)
+FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
+FetchContent_MakeAvailable(json)
+
 add_library(mscclpp_obj OBJECT)
 target_include_directories(mscclpp_obj
     SYSTEM PRIVATE
     ${GPU_INCLUDE_DIRS}
     ${IBVERBS_INCLUDE_DIRS}
     ${NUMA_INCLUDE_DIRS})
-target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads)
+target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads)
 set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
 if(USE_CUDA)
     target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA)

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
@@ -760,6 +760,10 @@ DeviceHandle<std::remove_reference_t<T>> deviceHandle(T&& t) {
   return t.deviceHandle();
 }
 
+/// Value type of a packet type: alias for the nested `T::ValueType` (e.g. `LL16Packet::ValueType`).
+template <class T>
+using PacketValType = typename T::ValueType;
+
 }  // namespace mscclpp
 
 namespace std {

diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_EXECUTOR_HPP_
+#define MSCCLPP_EXECUTOR_HPP_
+
+#include <memory>
+#include <mscclpp/core.hpp>
+#include <unordered_map>
+
+namespace mscclpp {
+
+/// Element data types accepted by Executor::execute().
+enum class DataType {
+  INT32,    ///< 32-bit signed integer elements.
+  UINT32,   ///< 32-bit unsigned integer elements.
+  FLOAT16,  ///< 16-bit floating-point elements.
+  FLOAT32,  ///< 32-bit floating-point elements.
+};
+
+/// Packet formats selectable in Executor::execute(), which defaults to LL16.
+enum class PacketType {
+  LL8,   ///< NOTE(review): presumably selects LL8Packet -- confirm against the executor implementation.
+  LL16,  ///< NOTE(review): presumably selects LL16Packet -- confirm against the executor implementation.
+};
+
+/// Handle to an execution plan, identified by a name and the path of a plan file.
+///
+/// The state lives behind a shared `Impl`, so copies of an ExecutionPlan refer to the same underlying object.
+class ExecutionPlan {
+ public:
+  /// @param name Name of the plan. NOTE(review): presumably selects an entry inside the plan file -- confirm.
+  /// @param planPath Path to the plan file (the Python tests pass JSON files).
+  ExecutionPlan(const std::string& name, const std::string& planPath);
+  ~ExecutionPlan() = default;
+
+ private:
+  struct Impl;
+  std::shared_ptr<Impl> impl_;
+
+  // Executor reads the plan's private implementation.
+  friend class Executor;
+};
+
+/// Runs ExecutionPlans on top of a Communicator. Not copyable.
+class Executor {
+ public:
+  /// @param comm Communicator used by the executor.
+  /// @param nranksPerNode Number of ranks per node.
+  Executor(std::shared_ptr<Communicator> comm, int nranksPerNode);
+  Executor(const Executor&) = delete;
+  Executor& operator=(const Executor&) = delete;
+  ~Executor();
+
+  /// Execute a plan.
+  /// @param rank Rank of the calling process.
+  /// @param sendbuff Send buffer.
+  /// @param recvBuff Receive buffer. The Python tests pass the same pointer as `sendbuff` (in-place).
+  /// @param sendBuffSize Size of the send buffer; the Python tests pass a byte count.
+  /// @param recvBuffSize Size of the receive buffer; the Python tests pass a byte count.
+  /// @param dataType Element type of the buffers.
+  /// @param nthreads Thread count for the launch. NOTE(review): presumably threads per thread block -- confirm.
+  /// @param plan The execution plan to run.
+  /// @param stream Stream handed to the execution. NOTE(review): presumably the work is enqueued on it -- confirm.
+  /// @param packetType Packet format to use; defaults to PacketType::LL16.
+  void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType,
+               int nthreads, const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType = PacketType::LL16);
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_EXECUTOR_HPP_
diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp
@@ -24,12 +24,20 @@ union alignas(16) LL16Packet {
     uint32_t data2;
     uint32_t flag2;
   };
+  using ValueType = uint2;
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
   ulonglong2 raw_;
 
   MSCCLPP_DEVICE_INLINE LL16Packet() {}
 
+  /// Construct a packet holding 8 bytes of data, with the same flag in both flag slots.
+  /// @param val The 8-byte data; `val.x` goes to `data1` and `val.y` to `data2`.
+  /// @param flag The flag written to both `flag1` and `flag2`.
+  MSCCLPP_DEVICE_INLINE LL16Packet(uint2 val, uint32_t flag) {
+    this->data1 = val.x;
+    this->data2 = val.y;
+    this->flag1 = flag;
+    this->flag2 = flag;
+  }
+
   /// Write 8 bytes of data to the packet.
   /// @param val1 The first 4-byte data to write.
   /// @param val2 The second 4-byte data to write.
@@ -95,10 +103,17 @@ union alignas(8) LL8Packet {
     uint32_t flag;
   };
   uint64_t raw_;
+
+  using ValueType = uint32_t;
 #if defined(MSCCLPP_DEVICE_COMPILE)
 
   MSCCLPP_DEVICE_INLINE LL8Packet() {}
 
+  /// Construct a packet from 4 bytes of data and a flag.
+  /// @param val The 4-byte data, stored in `data`.
+  /// @param flag The flag, stored in `flag`.
+  MSCCLPP_DEVICE_INLINE LL8Packet(uint32_t val, uint32_t flag) {
+    // `this->` is required for `flag`: the parameter shadows the member of the same name.
+    this->data = val;
+    this->flag = flag;
+  }
+
   MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) {
 #if defined(MSCCLPP_DEVICE_CUDA)
     asm volatile("st.volatile.global.v2.u32 [%0], {%1,%2};" ::"l"(&raw_), "r"(val), "r"(flag));

diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
@@ -19,6 +19,10 @@
     TcpBootstrap,
     Transport,
     TransportFlags,
+    DataType,
+    Executor,
+    ExecutionPlan,
+    PacketType,
     version,
     is_nvls_supported,
 )

diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp
@@ -20,6 +20,7 @@ extern void register_fifo(nb::module_& m);
 extern void register_semaphore(nb::module_& m);
 extern void register_utils(nb::module_& m);
 extern void register_numa(nb::module_& m);
+extern void register_executor(nb::module_& m);
 
 template <typename T>
 void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
@@ -204,4 +205,5 @@ NB_MODULE(_mscclpp, m) {
   register_utils(m);
   register_core(m);
   register_numa(m);
+  register_executor(m);
 }
diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+
+#include <mscclpp/executor.hpp>
+#include <mscclpp/gpu.hpp>
+
+namespace nb = nanobind;
+using namespace mscclpp;
+
+/// Register the executor bindings (DataType, PacketType, ExecutionPlan and Executor) on module `m`.
+void register_executor(nb::module_& m) {
+  nb::enum_<DataType>(m, "DataType")
+      .value("int32", DataType::INT32)
+      .value("uint32", DataType::UINT32)
+      .value("float16", DataType::FLOAT16)
+      .value("float32", DataType::FLOAT32);
+
+  nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
+
+  // The init signature mirrors ExecutionPlan's constructor, which takes both strings by const reference.
+  nb::class_<ExecutionPlan>(m, "ExecutionPlan")
+      .def(nb::init<const std::string&, const std::string&>(), nb::arg("name"), nb::arg("planPath"));
+
+  nb::class_<Executor>(m, "Executor")
+      .def(nb::init<std::shared_ptr<Communicator>, int>(), nb::arg("comm"), nb::arg("nranksPerNode"))
+      .def(
+          "execute",
+          // Buffers and the stream cross the Python boundary as integer addresses and are converted back to
+          // pointers/handles here before forwarding to Executor::execute().
+          [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+             DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) {
+            self->execute(rank, reinterpret_cast<void*>(sendbuff), reinterpret_cast<void*>(recvBuff), sendBuffSize,
+                          recvBuffSize, dataType, nthreads, plan, reinterpret_cast<cudaStream_t>(stream), packetType);
+          },
+          nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"),
+          nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"),
+          nb::arg("packetType") = PacketType::LL16);
+}
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
@@ -0,0 +1,77 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from os import path
+from mscclpp import (
+    DataType,
+    Executor,
+    ExecutionPlan,
+)
+import mscclpp.comm as mscclpp_comm
+
+import cupy as cp
+from mpi4py import MPI
+
+# Repository root, derived from this file's location (<root>/python/test/executor_test.py) instead of being
+# hard-coded, so the script also works when the checkout is not at /root/mscclpp.
+MSCCLPP_ROOT_PATH = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
+
+
+def bench_time(niters: int, func):
+    """Return the average time of one ``func`` call in microseconds.
+
+    ``func(stream)`` is called ``niters`` times while a CUDA graph is being captured on a fresh
+    non-blocking stream. The graph is launched once as a warm-up and once more between two events.
+    """
+    capture_stream = cp.cuda.Stream(non_blocking=True)
+    with capture_stream:
+        capture_stream.begin_capture()
+        for _ in range(niters):
+            func(capture_stream)
+        captured_graph = capture_stream.end_capture()
+
+    # Warm-up launch; not timed.
+    captured_graph.launch(capture_stream)
+
+    # Timed launch, bracketed by two events recorded on the same stream.
+    tic = cp.cuda.Event()
+    toc = cp.cuda.Event()
+    tic.record(capture_stream)
+    captured_graph.launch(capture_stream)
+    toc.record(capture_stream)
+    toc.synchronize()
+
+    # get_elapsed_time reports milliseconds; convert to microseconds per iteration.
+    elapsed_ms = cp.cuda.get_elapsed_time(tic, toc)
+    return elapsed_ms / niters * 1000.0
+
+
+if __name__ == "__main__":
+    # The size of the shared-memory communicator is the number of ranks on this node; it is used as the
+    # number of GPUs per node.
+    shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL)
+    N_GPUS_PER_NODE = shm_comm.size
+    shm_comm.Free()
+
+    # Bind this rank to a device index derived from its global rank.
+    cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use()
+    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
+    executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
+    execution_plan = ExecutionPlan(
+        "allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")
+    )
+
+    # Every rank seeds the generator identically, then takes the slice of the buffer matching its rank.
+    nelems = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(nelems).astype(cp.float16)
+    sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size)
+    sendbuf = sub_arrays[MPI.COMM_WORLD.rank]
+    mscclpp_group.barrier()
+
+    # In-place execution: the same pointer and size are passed as both send and receive buffer.
+    execution_time = bench_time(
+        1000,
+        lambda stream: executor.execute(
+            MPI.COMM_WORLD.rank,
+            sendbuf.data.ptr,
+            sendbuf.data.ptr,
+            sendbuf.nbytes,
+            sendbuf.nbytes,
+            DataType.float16,
+            512,
+            execution_plan,
+            stream.ptr,
+        ),
+    )
+    print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
+    # NOTE(review): the executor is dropped before the communicator group, presumably to control teardown
+    # order -- confirm.
+    executor = None
+    mscclpp_group = None
diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py
@@ -12,7 +12,10 @@
 import pytest
 
 from mscclpp import (
+    DataType,
     EndpointConfig,
+    ExecutionPlan,
+    Executor,
     Fifo,
     Host2DeviceSemaphore,
     Host2HostSemaphore,
@@ -25,7 +28,7 @@
 import mscclpp.comm as mscclpp_comm
 from mscclpp.utils import KernelBuilder, pack
 from ._cpp import _ext
-from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
+from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group, N_GPUS_PER_NODE
 
 ethernet_interface_name = "eth0"
 
@@ -590,3 +593,39 @@ def test_nvls(mpi_group: MpiGroup):
     kernel()
     cp.cuda.runtime.deviceSynchronize()
     group.barrier()
+
+
+@parametrize_mpi_groups(2)
+@pytest.mark.parametrize("filename", ["allreduce.json", "allreduce_packet.json"])
+def test_executor(mpi_group: MpiGroup, filename: str):
+    """Run an in-place allreduce plan through the Executor and compare it with a locally computed sum."""
+    if all_ranks_on_the_same_node(mpi_group) is False:
+        pytest.skip("algo not support cross node")
+
+    # Plan files live under <repo root>/test/execution-files; this file is <repo root>/python/test/<name>.py.
+    repo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    plan_path = os.path.join(repo_root, "test", "execution-files", filename)
+
+    mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
+    executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE)
+    execution_plan = ExecutionPlan("allreduce_pairs", plan_path)
+
+    world_size = mpi_group.comm.size
+    my_rank = mpi_group.comm.rank
+
+    # All ranks generate the same data from the same seed; each rank contributes the slice matching its rank.
+    nelems = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(nelems).astype(cp.float16)
+    sub_arrays = cp.split(buffer, world_size)
+    sendbuf = sub_arrays[my_rank]
+
+    # The reference must be computed before execution: the run is in-place and overwrites sendbuf.
+    expected = cp.zeros_like(sendbuf)
+    for chunk in sub_arrays:
+        expected += chunk
+    mscclpp_group.barrier()
+
+    stream = cp.cuda.Stream(non_blocking=True)
+    executor.execute(
+        my_rank,
+        sendbuf.data.ptr,
+        sendbuf.data.ptr,
+        sendbuf.nbytes,
+        sendbuf.nbytes,
+        DataType.float16,
+        512,
+        execution_plan,
+        stream.ptr,
+    )
+    stream.synchronize()
+    assert cp.allclose(sendbuf, expected, atol=1e-3 * world_size)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu)
 target_sources(mscclpp_obj PRIVATE ${SOURCES})
 target_include_directories(mscclpp_obj PRIVATE include)
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "execution_kernel.hpp"
+
+#if defined(MSCCLPP_DEVICE_CUDA)
+namespace mscclpp {
+
+/// Launch `executionKernel` instantiated for the element type selected by `dataType`.
+///
+/// `PacketType` is forwarded explicitly to every instantiation so that the packet format requested by the
+/// caller is used for all data types, not only for INT32.
+///
+/// @param rank Rank forwarded to the kernel.
+/// @param nthreadblocks Grid dimension of the launch.
+/// @param nthreads Block dimension of the launch.
+/// @param src Source buffer, reinterpreted as the selected element type.
+/// @param dst Destination buffer, reinterpreted as the selected element type.
+/// @param scratch Scratch buffer, reinterpreted as the selected element type.
+/// @param scratchSize Scratch size forwarded to the kernel.
+/// @param dataType Selects the element type the kernel is instantiated with.
+/// @param plan Plan pointer forwarded to the kernel.
+/// @param sharedMemSize Dynamic shared memory size of the launch, in bytes.
+/// @param stream Stream the kernel is launched on.
+/// @param flag Flag value forwarded to the kernel.
+template <typename PacketType>
+void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
+                                   size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan,
+                                   size_t sharedMemSize, cudaStream_t stream, uint32_t flag) {
+  switch (dataType) {
+    case DataType::INT32:
+      executionKernel<int32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, static_cast<int32_t*>(src), static_cast<int32_t*>(dst), static_cast<int32_t*>(scratch), scratchSize,
+          plan, flag);
+      break;
+    case DataType::UINT32:
+      executionKernel<uint32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, static_cast<uint32_t*>(src), static_cast<uint32_t*>(dst), static_cast<uint32_t*>(scratch), scratchSize,
+          plan, flag);
+      break;
+    case DataType::FLOAT16:
+      executionKernel<half, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, static_cast<half*>(src), static_cast<half*>(dst), static_cast<half*>(scratch), scratchSize, plan,
+          flag);
+      break;
+    case DataType::FLOAT32:
+      executionKernel<float, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+          rank, static_cast<float*>(src), static_cast<float*>(dst), static_cast<float*>(scratch), scratchSize, plan,
+          flag);
+      break;
+  }
+}
+
+// Explicit instantiations of launchKernel for the two packet types, LL16Packet and LL8Packet.
+template void ExecutionKernel::launchKernel<LL16Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
+                                                        void* scratch, size_t scratchSize, DataType dataType,
+                                                        DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                                        cudaStream_t stream, uint32_t flag);
+template void ExecutionKernel::launchKernel<LL8Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
+                                                       void* scratch, size_t scratchSize, DataType dataType,
+                                                       DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                                       cudaStream_t stream, uint32_t flag);
+}  // namespace mscclpp
+#endif