Commit

Merge branch 'main' into qinghuazhou/nccl-test-support-mscclpp-nccl-H100
seagater committed Dec 11, 2024
2 parents 5c97835 + 7a3dcb0 commit 37e5951
Showing 30 changed files with 422 additions and 236 deletions.
168 changes: 168 additions & 0 deletions .azure-pipelines/nccl-api-test.yaml
@@ -0,0 +1,168 @@
trigger:
- main

pr:
branches:
include:
- main
drafts: false

jobs:
- job: NcclTest
displayName: Run MSCCLPP over NCCL Test
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
pool:
name: msccl-ci
container:
image: $[ variables['containerImage'] ]

steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'

- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem

- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: mscclpp-ci
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclpp-ci --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: mscclpp/test/deploy/deploy.sh
arguments: "nccltest-single-node"
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp

- task: Bash@3
name: CopyMscclUsers
displayName: Copy msccl-users
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
DST_DIR="/tmp/mscclpp/msccl-users"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: InstallMscclTools
displayName: Install msccl-tools
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
git clone https://github.com/Azure/msccl-tools.git; \
cd /root/mscclpp/msccl-tools; pip3 install ."'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: GenerateExecutionFile
displayName: Generate execution file
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
cd /root/mscclpp/msccl-users; \
mkdir -p execution-files; \
cd /root/mscclpp/msccl-users; \
bash algos/mscclpp_a100/generate_execution_plan.sh"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: InstallNcclTests
displayName: Install NCCL Tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: RunNcclAllreduceTest
displayName: Run NCCL Allreduce Test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
cd /root/mscclpp; \
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: mscclpp-ci
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclpp-ci --resource-group mscclpp
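
Reviewer note: the test step above points nccl-tests at MSCCL++ via `LD_PRELOAD=.../libmscclpp_nccl.so`, so the benchmark's standard NCCL calls are served by MSCCL++'s NCCL-compatible library. The sketch below shows the kind of NCCL-API usage being exercised; it is not part of this commit, and the buffer size, data type, and one-process-per-GPU layout are assumptions mirroring the `mpirun -np 8 ... -d half` invocation.

```cpp
// Sketch only (not part of this commit): the NCCL calls that nccl-tests issues and
// that libmscclpp_nccl.so intercepts when preloaded. Assumes one MPI rank per GPU.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>

int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);
  int rank, nranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  cudaSetDevice(rank);  // assumes one process per local GPU

  // Rank 0 creates the NCCL unique ID and broadcasts it to all ranks.
  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, rank);

  const size_t count = 1 << 20;  // element count; the pipeline sweeps 1K..1G bytes
  half *sendbuf, *recvbuf;
  cudaMalloc(&sendbuf, count * sizeof(half));
  cudaMalloc(&recvbuf, count * sizeof(half));

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // With LD_PRELOAD set, this allreduce runs on MSCCL++ rather than stock NCCL.
  ncclAllReduce(sendbuf, recvbuf, count, ncclFloat16, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);

  ncclCommDestroy(comm);
  MPI_Finalize();
  return 0;
}
```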
2 changes: 1 addition & 1 deletion README.md
@@ -50,7 +50,7 @@ MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer

```cpp
// `ProxyChannel` will be explained in the following section.
-__device__ mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> channel;
+__device__ mscclpp::DeviceHandle<mscclpp::ProxyChannel> channel;
__global__ void gpuKernel() {
...
// Only one thread is needed for this method.
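Reviewer note: the README snippet is cut off by the diff view. As a rough illustration of the comment above ("only one thread is needed"), a single thread can drive a whole transfer through the renamed handle. This sketch is not from the README; it borrows `putWithSignalAndFlush` from the design-document example later in this commit, and the zero offsets are assumptions.

```cpp
// Sketch only: a lone thread hands the copy off to the host-side proxy.
__device__ mscclpp::DeviceHandle<mscclpp::ProxyChannel> channel;

__global__ void gpuKernel(size_t nBytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    // Offsets are into the memory regions registered with the ProxyService.
    channel.putWithSignalAndFlush(0 /*dstOffset*/, 0 /*srcOffset*/, nBytes);
  }
}
```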
1 change: 1 addition & 0 deletions docker/base-dev-x.dockerfile
@@ -28,6 +28,7 @@ ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

# Set PATH
6 changes: 3 additions & 3 deletions docs/design/design.md
@@ -117,7 +117,7 @@ In this section, we will discuss several use cases that demonstrate the capabili

MSCCL++ enables the offloading of communication logic from the GPU to the CPU, facilitating the overlapping of communication and computation processes. The code snippet provided illustrates this overlapping technique. In the depicted scenario, the GPU emits a signal to the CPU indicating readiness for data transfer. Subsequently, while the GPU continues to execute computation tasks, the CPU initiates the data transfer to the designated target device.
```cpp
-__device__ void gpuKernel(mscclpp::SimpleProxyChannelDeviceHandle* proxyChannel) {
+__device__ void gpuKernel(mscclpp::ProxyChannelDeviceHandle* proxyChannel) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Send a trigger to the CPU
if (tid == 0) {
@@ -138,11 +138,11 @@ Traditional communication libraries enforce a separation between communication a
MSCCL++ offers a low-level communication API, allowing users to design customized collective communication algorithms. The following code demonstrates how to implement a customized All2All algorithm using MSCCL++.
```cpp
using DeviceHandle = mscclpp::DeviceHandle<T>;
-__device__ void localAlltoall(DeviceHandle<mscclpp::SimpleProxyChannel>* proxyChans, int rank,
+__device__ void localAlltoall(DeviceHandle<mscclpp::ProxyChannel>* proxyChans, int rank,
int nRanksPerNode, size_t nElements) {
int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
for (int i = 1; i < nRanksPerNode; i++) {
-    DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = proxyChans[blockIdx.x];
+    DeviceHandle<mscclpp::ProxyChannel> proxyChan = proxyChans[blockIdx.x];
if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
nElements * sizeof(int));
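Reviewer note: since `localAlltoall` above is a `__device__` function indexed by `blockIdx.x`, it has to be called from a kernel launched with roughly one block per peer. The wrapper and launch configuration below are a sketch, not code from this commit; the block and thread counts are assumptions.

```cpp
// Sketch only: wrap localAlltoall in a kernel so it can be launched from the host.
__global__ void alltoallKernel(DeviceHandle<mscclpp::ProxyChannel>* proxyChans, int rank,
                               int nRanksPerNode, size_t nElements) {
  localAlltoall(proxyChans, rank, nRanksPerNode, nElements);
}

// Host side: proxyChans points to device memory holding one handle per peer.
// alltoallKernel<<<nRanksPerNode - 1, 32>>>(proxyChans, rank, nRanksPerNode, nElements);
```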
10 changes: 5 additions & 5 deletions docs/getting-started/tutorials/initialization.md
@@ -21,7 +21,7 @@ We will set up a mesh topology with eight GPUs. Each GPU will be connected to its

template <class T>
using DeviceHandle = mscclpp::DeviceHandle<T>;
-__constant__ DeviceHandle<mscclpp::SimpleProxyChannel> constProxyChans[8];
+__constant__ DeviceHandle<mscclpp::ProxyChannel> constProxyChans[8];

void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
std::string ip_port = "10.0.0.4:50000";
@@ -55,17 +55,17 @@ void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {

comm.setup();

-  std::vector<DeviceHandle<mscclpp::SimpleProxyChannel>> proxyChannels;
+  std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannels;
for (size_t i = 0; i < semaphoreIds.size(); ++i) {
-    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::SimpleProxyChannel(
+    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::ProxyChannel(
proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
proxyService.addMemory(localMemories[i]))));
}

-  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>)) {
+  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::ProxyChannel>)) {
std::runtime_error("unexpected error");
}
CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
-                               sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>) * proxyChannels.size()));
+                               sizeof(DeviceHandle<mscclpp::ProxyChannel>) * proxyChannels.size()));
}
```
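Reviewer note: on the device side, the handles staged into `constProxyChans` above can then be used directly from a kernel. The kernel below is a sketch, not part of the tutorial; it reuses the `putWithSignalAndFlush` call shown in the design document, and the zero offsets into the registered regions are assumptions.

```cpp
// Sketch only: consume the handles copied into __constant__ memory above.
__global__ void putToPeer(int peerIdx, size_t nBytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    constProxyChans[peerIdx].putWithSignalAndFlush(0 /*dstOffset*/, 0 /*srcOffset*/, nBytes);
  }
}
```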
4 changes: 2 additions & 2 deletions docs/getting-started/tutorials/python-api.md
@@ -47,7 +47,7 @@ We provide some Python utils to help you launch kernels via Python. Here is an example
```python
from mscclpp.utils import KernelBuilder, pack

-def launch_kernel(my_rank: int, nranks: int, simple_channels: List[SimpleProxyChannel], memory: cp.ndarray):
+def launch_kernel(my_rank: int, nranks: int, simple_channels: List[ProxyChannel], memory: cp.ndarray):
file_dir = os.path.dirname(os.path.abspath(__file__))
kernel = KernelBuilder(file="test.cu", kernel_name="test", file_dir=file_dir).get_compiled_kernel()
params = b""
@@ -77,7 +77,7 @@ The test kernel is defined in `test.cu` as follows:
// be careful about using channels[my_rank] as it is invalid; it is there just for simplicity of indexing
extern "C" __global__ void __launch_bounds__(1024, 1)
-    simple_proxy_channel(mscclpp::SimpleProxyChannelDeviceHandle* channels, int my_rank, int nranks,
+    proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks,
int num_elements) {
int tid = threadIdx.x;
int nthreads = blockDim.x;
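Reviewer note: the body of the test kernel is cut off above. The sketch below is not the actual `test.cu` from this repository; it only shows one plausible way the `ProxyChannelDeviceHandle` array passed from Python could be used, assuming `int` elements and zero offsets.

```cpp
// Sketch only (not the actual test.cu): one possible body for the truncated kernel.
extern "C" __global__ void __launch_bounds__(1024, 1)
    proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks,
                  int num_elements) {
  int tid = threadIdx.x;
  if (tid == 0) {
    for (int r = 0; r < nranks; ++r) {
      if (r == my_rank) continue;  // channels[my_rank] is a placeholder and must not be used
      channels[r].putWithSignalAndFlush(0 /*dstOffset*/, 0 /*srcOffset*/,
                                        num_elements * sizeof(int));
    }
  }
}
```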
59 changes: 33 additions & 26 deletions include/mscclpp/proxy_channel.hpp
@@ -11,6 +11,7 @@

namespace mscclpp {

+struct BaseProxyChannel;
struct ProxyChannel;

/// Base class for proxy services. Proxy services are used to proxy data between devices.
@@ -48,10 +49,17 @@ class ProxyService : public BaseProxyService {
/// @return The semaphore.
std::shared_ptr<Host2DeviceSemaphore> semaphore(SemaphoreId id) const;

-  /// Get a proxy channel by semaphore ID.
+  /// Get a base proxy channel by semaphore ID.
/// @param id The ID of the semaphore.
+  /// @return The base proxy channel.
+  BaseProxyChannel baseProxyChannel(SemaphoreId id);
+
+  /// Get a proxy channel by semaphore ID and memory regions.
+  /// @param id The ID of the semaphore.
+  /// @param dst The destination memory region.
+  /// @param src The source memory region.
/// @return The proxy channel.
-  ProxyChannel proxyChannel(SemaphoreId id);
+  ProxyChannel proxyChannel(SemaphoreId id, MemoryId dst, MemoryId src);

/// Start the proxy service.
void startProxy();
@@ -71,66 +71,65 @@ class ProxyService : public BaseProxyService {
};

/// Proxy channel.
-struct ProxyChannel {
- private:
+struct BaseProxyChannel {
+ protected:
SemaphoreId semaphoreId_;

std::shared_ptr<Host2DeviceSemaphore> semaphore_;

std::shared_ptr<Proxy> proxy_;

public:
-  ProxyChannel() = default;
+  BaseProxyChannel() = default;

-  ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore, std::shared_ptr<Proxy> proxy);
+  BaseProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore,
+                   std::shared_ptr<Proxy> proxy);

-  ProxyChannel(const ProxyChannel& other) = default;
+  BaseProxyChannel(const BaseProxyChannel& other) = default;

-  ProxyChannel& operator=(ProxyChannel& other) = default;
+  BaseProxyChannel& operator=(BaseProxyChannel& other) = default;

-  /// Device-side handle for @ref ProxyChannel.
-  using DeviceHandle = ProxyChannelDeviceHandle;
+  /// Device-side handle for @ref BaseProxyChannel.
+  using DeviceHandle = BaseProxyChannelDeviceHandle;

/// Returns the device-side handle.
///
-  /// User should make sure the ProxyChannel is not released when using the returned handle.
+  /// User should make sure the BaseProxyChannel is not released when using the returned handle.
///
DeviceHandle deviceHandle() const;
};

-/// Simple proxy channel with a single destination and source memory region.
-struct SimpleProxyChannel {
+/// A common form of proxy channel with a single destination and source memory region.
+struct ProxyChannel : public BaseProxyChannel {
private:
-  ProxyChannel proxyChan_;
MemoryId dst_;
MemoryId src_;

public:
/// Default constructor.
-  SimpleProxyChannel() = default;
+  ProxyChannel() = default;

/// Constructor.
-  /// @param proxyChan The proxy channel.
+  /// @param semaphoreId The ID of the semaphore.
+  /// @param semaphore The semaphore.
+  /// @param proxy The proxy.
/// @param dst The destination memory region.
/// @param src The source memory region.
-  SimpleProxyChannel(ProxyChannel proxyChan, MemoryId dst, MemoryId src);
-
-  /// Constructor.
-  /// @param proxyChan The proxy channel.
-  SimpleProxyChannel(ProxyChannel proxyChan) : proxyChan_(proxyChan) {}
+  ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore, std::shared_ptr<Proxy> proxy,
+               MemoryId dst, MemoryId src);

/// Copy constructor.
-  SimpleProxyChannel(const SimpleProxyChannel& other) = default;
+  ProxyChannel(const ProxyChannel& other) = default;

/// Assignment operator.
-  SimpleProxyChannel& operator=(SimpleProxyChannel& other) = default;
+  ProxyChannel& operator=(ProxyChannel& other) = default;

-  /// Device-side handle for @ref SimpleProxyChannel.
-  using DeviceHandle = SimpleProxyChannelDeviceHandle;
+  /// Device-side handle for @ref ProxyChannel.
+  using DeviceHandle = ProxyChannelDeviceHandle;

/// Returns the device-side handle.
///
-  /// User should make sure the SimpleProxyChannel is not released when using the returned handle.
+  /// User should make sure the ProxyChannel is not released when using the returned handle.
///
DeviceHandle deviceHandle() const;
};
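
Reviewer note: callers that previously built a `SimpleProxyChannel` from `proxyChannel(id)` plus two `MemoryId`s now get the same thing from the new `proxyChannel(id, dst, src)` overload. The loop below is a sketch of how the initialization-tutorial code earlier in this commit might migrate; it is not part of this commit, and the variable names (`proxyService`, `semaphoreIds`, `remoteMemories`, `localMemories`) are borrowed from that tutorial.

```cpp
// Sketch only: building the renamed ProxyChannel through the new overload.
std::vector<mscclpp::DeviceHandle<mscclpp::ProxyChannel>> proxyChannels;
for (size_t i = 0; i < semaphoreIds.size(); ++i) {
  mscclpp::MemoryId dst = proxyService.addMemory(remoteMemories[i].get());
  mscclpp::MemoryId src = proxyService.addMemory(localMemories[i]);
  proxyChannels.push_back(mscclpp::deviceHandle(proxyService.proxyChannel(semaphoreIds[i], dst, src)));
}
```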
(Diffs for the remaining 23 changed files are omitted.)
