Skip to content

Commit

Permalink
Merge branch 'main' into saemal/use_atomic_sem
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang authored Oct 11, 2023
2 parents d2cdef2 + 8c0f9e8 commit c9e2381
Show file tree
Hide file tree
Showing 21 changed files with 131 additions and 160 deletions.
10 changes: 2 additions & 8 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
name: Analyze
runs-on: 'ubuntu-latest'
container:
image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}

permissions:
actions: read
Expand All @@ -27,7 +27,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Check disk space
run: |
Expand All @@ -38,12 +38,6 @@ jobs:
with:
languages: ${{ matrix.language }}

- name: Install cmake
run: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
- name: Dubious ownership exception
run: |
git config --global --add safe.directory /__w/mscclpp/mscclpp
Expand Down
13 changes: 2 additions & 11 deletions .github/workflows/integration-test-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,17 @@ jobs:
cuda: [ cuda11.8, cuda12.1 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Install CMake
run: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
- name: Build
run: |
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
Expand All @@ -41,7 +36,6 @@ jobs:
- name: Run mscclpp AllGather test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
Expand All @@ -50,13 +44,11 @@ jobs:
- name: Run mscclpp SendRecv test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
Expand All @@ -68,7 +60,6 @@ jobs:
- name: Run mscclpp AllToAll test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:

steps:
- name: Check out Git repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Install ClangFormat
run: |
Expand All @@ -28,25 +28,25 @@ jobs:

steps:
- name: Check out Git repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3

- name: Install Python dependencies
run: python3.8 -m pip install black
run: python3 -m pip install black

- name: Run black
run: python3.8 -m black --check --config pyproject.toml .
run: python3 -m black --check --config pyproject.toml .

spelling:
runs-on: ubuntu-20.04

steps:
- name: Check out Git repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Download misspell
run: |
Expand Down
17 changes: 2 additions & 15 deletions .github/workflows/ut-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
cuda: [ cuda11.8, cuda12.1 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
Expand All @@ -23,10 +23,8 @@ jobs:

- name: Build
run: |
curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}

Expand All @@ -36,31 +34,20 @@ jobs:
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
working-directory: ${{ github.workspace }}
- name: UnitTests
run: |
./build/test/unit_tests
working-directory: ${{ github.workspace }}
- name: MpUnitTests
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
working-directory: ${{ github.workspace }}
- name: PyTests
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
python3 -m pip install -r ../python/test/requirements_cu11.txt
else
python3 -m pip install -r ../python/test/requirements_cu12.txt
fi
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
working-directory: ${{ github.workspace }}
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Licensed under the MIT license.

set(MSCCLPP_MAJOR "0")
set(MSCCLPP_MINOR "2")
set(MSCCLPP_MINOR "3")
set(MSCCLPP_PATCH "0")

set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
Expand Down
36 changes: 10 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a

* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logic, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.

## Key Features (v0.2)
## Key Features (v0.3)

MSCCL++ v0.2 supports the following features.
MSCCL++ v0.3 supports the following features.

### In-Kernel Communication Interfaces

Expand Down Expand Up @@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans

Most key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt new software / hardware technologies and lets users implement algorithms optimized for their own use cases.

## Status & Roadmap
### New in MSCCL++ v0.3 (Latest Release)
* Update interfaces
* Add Python bindings and interfaces
* Add Python unit tests
* Add more configurable parameters
* Add a new single-node AllReduce kernel
* Fix bugs

MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.

### MSCCL++ v0.4 (TBU)
* Automatic task scheduler
* Dynamic performance tuning

### MSCCL++ v0.3 (TBU)
* Tile-based communication: efficient transport of 2D data patches (tiles)
* GPU computation interfaces

### MSCCL++ v0.2 (Latest Release)
* Basic communication functionalities and new interfaces
- GPU-side communication interfaces
- Host-side helpers: bootstrap, communicator, and proxy
- Supports both NVLink and InfiniBand
- Supports both in-SM copy and DMA/RDMA
* Communication performance optimization
- Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
* Development pipeline
* Documentation

### MSCCL++ v0.1
* Proof-of-concept, preliminary interfaces
See https://github.com/microsoft/mscclpp/issues/89 for details.

## Contributing

Expand Down
11 changes: 8 additions & 3 deletions docker/base-cuda12.1.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
RUN rm -rf /opt/nvidia

RUN apt-get clean && \
apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
Expand Down Expand Up @@ -47,8 +50,10 @@ RUN cd /tmp && \
cd .. && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*

ENV PATH="${PATH}:/usr/local/mpi/bin" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
ENV PATH="/usr/local/mpi/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"

RUN echo PATH="${PATH}" > /etc/environment && \
echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

ENTRYPOINT []
28 changes: 28 additions & 0 deletions docker/dev-cuda11.8.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Development image for CUDA 11.8: the base image plus CMake and the Python
# test requirements.
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

# MSCCLPP_SRC_DIR: temporary copy of the build context, deleted by the cleanup
# step at the end of this file.
# CMAKE_VERSION: CMake release to download and install.
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
    CMAKE_VERSION="3.26.4"

ADD . ${MSCCLPP_SRC_DIR}
WORKDIR ${MSCCLPP_SRC_DIR}

# Install CMake (release selected by CMAKE_VERSION).
# The tarball is downloaded to /tmp, unpacked under /usr/local, and deleted in
# the same RUN step.
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
    rm -rf ${CMAKE_HOME}.tar.gz
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"

# Install pytest & dependencies
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt

# Record PATH and LD_LIBRARY_PATH in /etc/environment.
# `>` truncates the file, so every variable that should remain in it has to be
# written again here, not only the updated PATH.
# NOTE(review): assumes base-cuda11.8 also records LD_LIBRARY_PATH in
# /etc/environment (as base-cuda12.1 does) -- confirm against that Dockerfile.
RUN echo PATH="${PATH}" > /etc/environment && \
    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

# Cleanup
WORKDIR /
RUN rm -rf ${MSCCLPP_SRC_DIR}
27 changes: 27 additions & 0 deletions docker/dev-cuda12.1.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Development image for CUDA 12.1: the base image plus CMake and the Python
# test requirements.
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

# MSCCLPP_SRC_DIR: temporary copy of the build context, deleted by the cleanup
# step at the end of this file.
# CMAKE_VERSION: CMake release to download and install.
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
    CMAKE_VERSION="3.26.4"

ADD . ${MSCCLPP_SRC_DIR}
WORKDIR ${MSCCLPP_SRC_DIR}

# Install CMake (release selected by CMAKE_VERSION).
# The tarball is downloaded to /tmp, unpacked under /usr/local, and deleted in
# the same RUN step so it is not left behind in the image.
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
    rm -rf ${CMAKE_HOME}.tar.gz
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"

# Install pytest & dependencies
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt

# Record PATH and LD_LIBRARY_PATH in /etc/environment.
# The base image stores both variables in this file; `>` truncates it, so
# LD_LIBRARY_PATH must be written again alongside the updated PATH.
RUN echo PATH="${PATH}" > /etc/environment && \
    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

# Cleanup
WORKDIR /
RUN rm -rf ${MSCCLPP_SRC_DIR}
5 changes: 4 additions & 1 deletion include/mscclpp/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#define MSCCLPP_CORE_HPP_

#define MSCCLPP_MAJOR 0
#define MSCCLPP_MINOR 2
#define MSCCLPP_MINOR 3
#define MSCCLPP_PATCH 0
#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)

Expand All @@ -24,6 +24,9 @@ namespace mscclpp {
/// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process.
using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;

/// Return a version string.
std::string version();

/// Base class for bootstraps.
class Bootstrap {
public:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "mscclpp"
version = "0.2.0"
version = "0.3.0"

[tool.scikit-build]
cmake.minimum-version = "3.25.0"
Expand Down
5 changes: 2 additions & 3 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@ add_subdirectory(test)
add_custom_target(pylib-copy)
add_custom_command(TARGET pylib-copy POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
)

3 changes: 3 additions & 0 deletions python/mscclpp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
TcpBootstrap,
Transport,
TransportFlags,
version,
)

__version__ = version()


def get_include():
"""Return the directory that contains the MSCCL++ headers."""
Expand Down
2 changes: 2 additions & 0 deletions python/mscclpp/core_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
}

void register_core(nb::module_& m) {
m.def("version", &version);

nb::class_<Bootstrap>(m, "Bootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)
Expand Down
Loading

0 comments on commit c9e2381

Please sign in to comment.