microsoft · chhwang · Mar 27, 2024 · Mar 26, 2024 · Mar 26, 2024 · Mar 26, 2024
diff --git a/CITATION.cff b/CITATION.cff
@@ -1,6 +1,6 @@
 cff-version: 1.2.0
 title: "MSCCL++: A GPU-driven communication stack for scalable AI applications"
-version: 0.4.2
+version: 0.4.3
 message: >-
   If you use this project in your research, please cite it as below.
 authors:
@@ -31,6 +31,9 @@ authors:
   - given-names: Olli
     family-names: Saarikivi
     affiliation: Microsoft Research
+  - given-names: Aashaka
+    family-names: Shah
+    affiliation: Microsoft Research
   - given-names: Wei
     family-names: Tsui
     affiliation: Microsoft Research

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@
 
 set(MSCCLPP_MAJOR "0")
 set(MSCCLPP_MINOR "4")
-set(MSCCLPP_PATCH "2")
+set(MSCCLPP_PATCH "3")
 
 set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
 set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")
@@ -101,7 +101,7 @@ find_package(Threads REQUIRED)
 
 add_library(mscclpp_obj OBJECT)
 target_include_directories(mscclpp_obj
-    PRIVATE
+    SYSTEM PRIVATE
     ${GPU_INCLUDE_DIRS}
     ${IBVERBS_INCLUDE_DIRS}
     ${NUMA_INCLUDE_DIRS})

diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
@@ -27,8 +27,8 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
 ADD . /tmp/mscclpp
 WORKDIR /tmp/mscclpp
 ARG TARGET="cuda12.1"
-RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
-    python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt
+RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
+    python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt
 
 # Set PATH
 RUN echo PATH="${PATH}" > /etc/environment

diff --git a/docker/build.sh b/docker/build.sh
@@ -7,20 +7,22 @@ baseImageTable=(
     ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
     ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
     ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
+    ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
 )
 
 declare -A extraLdPathTable
 extraLdPathTable=(
     ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
     ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
     ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
+    ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
 )
 
 GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
 TARGET=${1}
 
 print_usage() {
-    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
+    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3]"
 }
 
 if [[ ! -v "baseImageTable[${TARGET}]" ]]; then

diff --git a/docs/conf.py b/docs/conf.py
@@ -9,7 +9,7 @@
 project = "mscclpp"
 copyright = "2023, MSCCL++ Team"
 author = "MSCCL++ Team"
-release = "v0.4.2"
+release = "v0.4.3"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -11,11 +11,11 @@
     * NVIDIA A100 GPUs + CUDA >= 11.8
     * NVIDIA H100 GPUs + CUDA >= 12.0
     * AMD MI250X GPUs + ROCm >= 5.7
-    * AMD MI300X GPUs + ROCm >= 5.7
+    * AMD MI300X GPUs + ROCm >= 6.0
 * OS: tested over Ubuntu 18.04 and 20.04
 * Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
 * Others
-    * `nvidia_peermem` driver should be loaded on all nodes. Check it via:
+    * For NVIDIA platforms, the `nvidia_peermem` driver should be loaded on all nodes. Check it via:
         ```
         lsmod | grep nvidia_peermem
         ```
@@ -59,15 +59,18 @@ $ sudo make install/fast
 Python 3.8 or later is required.
 
 ```bash
+# For NVIDIA platforms
 $ python -m pip install .
+# For AMD platforms
+$ CXX=/path/to/hipcc python -m pip install .
 ```
 
 ## Docker Images
 
 Our base image installs all prerequisites for MSCCL++.
 
 ```bash
-$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.3
 ```
 
 See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
@@ -101,8 +104,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.
 [Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
 
 ```bash
-# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
-$ python3 -m pip install -r ./python/requirements_cu12.txt
+# Choose `requirements_*.txt` according to your CUDA/ROCm version.
+$ python3 -m pip install -r ./python/requirements_cuda12.txt
 $ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
 ```
 

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
@@ -6,7 +6,7 @@
 
 #define MSCCLPP_MAJOR 0
 #define MSCCLPP_MINOR 4
-#define MSCCLPP_PATCH 2
+#define MSCCLPP_PATCH 3
 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
 
 #include <array>
@@ -51,6 +51,10 @@ class Bootstrap {
 /// A native implementation of the bootstrap using TCP sockets.
 class TcpBootstrap : public Bootstrap {
  public:
+  /// Create a random unique ID.
+  /// @return The created unique ID.
+  static UniqueId createUniqueId();
+
   /// Constructor.
   /// @param rank The rank of the process.
   /// @param nRanks The total number of ranks.
@@ -59,10 +63,6 @@ class TcpBootstrap : public Bootstrap {
   /// Destructor.
   ~TcpBootstrap();
 
-  /// Create a random unique ID and store it in the @ref TcpBootstrap.
-  /// @return The created unique ID.
-  UniqueId createUniqueId();
-
   /// Return the unique ID stored in the @ref TcpBootstrap.
   /// @return The unique ID stored in the @ref TcpBootstrap.
   UniqueId getUniqueId() const;

diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
@@ -6,6 +6,8 @@
 
 #if defined(__HIP_PLATFORM_AMD__)
 
+// #include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
 
 using cudaError_t = hipError_t;
@@ -61,6 +63,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
 #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__)
 #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__)
 #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__)
+#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__)
+#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__)
 #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__)
 #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__)
 #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
@@ -90,6 +94,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#if (CUDART_VERSION >= 11000)
+#include <cuda_bf16.h>
+#endif
+#if (CUDART_VERSION >= 11080)
+#include <cuda_fp8.h>
+#endif
 
 #endif
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "mscclpp"
-version = "0.4.2"
+version = "0.4.3"
 
 [tool.scikit-build]
 cmake.minimum-version = "3.25.0"

diff --git a/python/mscclpp/CMakeLists.txt b/python/mscclpp/CMakeLists.txt
@@ -9,6 +9,6 @@ FetchContent_MakeAvailable(nanobind)
 file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
 nanobind_add_module(mscclpp_py ${SOURCES})
 set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
-target_link_libraries(mscclpp_py PRIVATE ${GPU_LIBRARIES} mscclpp_static)
-target_include_directories(mscclpp_py PRIVATE ${GPU_INCLUDE_DIRS})
+target_link_libraries(mscclpp_py PRIVATE mscclpp_static ${GPU_LIBRARIES})
+target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
 install(TARGETS mscclpp_py LIBRARY DESTINATION .)
diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp
@@ -63,7 +63,7 @@ void register_core(nb::module_& m) {
       .def_static(
           "create", [](int rank, int nRanks) { return std::make_shared<TcpBootstrap>(rank, nRanks); }, nb::arg("rank"),
           nb::arg("nRanks"))
-      .def("create_unique_id", &TcpBootstrap::createUniqueId)
+      .def_static("create_unique_id", &TcpBootstrap::createUniqueId)
       .def("get_unique_id", &TcpBootstrap::getUniqueId)
       .def("initialize", static_cast<void (TcpBootstrap::*)(UniqueId, int64_t)>(&TcpBootstrap::initialize),
            nb::call_guard<nb::gil_scoped_release>(), nb::arg("uniqueId"), nb::arg("timeoutSec") = 30)

diff --git a/python/requirements_cu11.txt → python/requirements_cuda11.txt b/python/requirements_cu11.txt → python/requirements_cuda11.txt
diff --git a/python/requirements_cu12.txt → python/requirements_cuda12.txt b/python/requirements_cu12.txt → python/requirements_cuda12.txt
diff --git a/python/test/CMakeLists.txt b/python/test/CMakeLists.txt
@@ -9,5 +9,5 @@ FetchContent_MakeAvailable(nanobind)
 file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
 nanobind_add_module(mscclpp_py_test ${SOURCES})
 set_target_properties(mscclpp_py_test PROPERTIES OUTPUT_NAME _ext)
-target_link_libraries(mscclpp_py_test PRIVATE ${GPU_LIBRARIES} mscclpp_static)
-target_include_directories(mscclpp_py_test PRIVATE ${GPU_INCLUDE_DIRS})
+target_link_libraries(mscclpp_py_test PRIVATE mscclpp_static ${GPU_LIBRARIES})
+target_include_directories(mscclpp_py_test SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})