Commit

Merge branch 'main' into qinghuazhou/nccl-test-support-mscclpp-nccl-H100
seagater committed Dec 11, 2024
2 parents 5c97835 + 7a3dcb0 commit 37e5951
Showing 30 changed files with 422 additions and 236 deletions.
168 changes: 168 additions & 0 deletions .azure-pipelines/nccl-api-test.yaml
@@ -0,0 +1,168 @@
trigger:
- main

pr:
branches:
include:
- main
drafts: false

jobs:
- job: NcclTest
displayName: Run MSCCLPP over NCCL Test
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
pool:
name: msccl-ci
container:
image: $[ variables['containerImage'] ]

steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'

- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem

- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: mscclpp-ci
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclpp-ci --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: mscclpp/test/deploy/deploy.sh
arguments: "nccltest-single-node"
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp

- task: Bash@3
name: CopyMscclUsers
displayName: Copy msccl-users
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
DST_DIR="/tmp/mscclpp/msccl-users"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: InstallMscclTools
displayName: Install msccl-tools
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
git clone https://github.com/Azure/msccl-tools.git; \
cd /root/mscclpp/msccl-tools; pip3 install ."'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: GenerateExecutionFile
displayName: Generate execution file
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
cd /root/mscclpp/msccl-users; \
mkdir -p execution-files; \
cd /root/mscclpp/msccl-users; \
bash algos/mscclpp_a100/generate_execution_plan.sh"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: InstallNcclTests
displayName: Install NCCL Tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: RunNcclAllreduceTest
displayName: Run NCCL Allreduce Test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
cd /root/mscclpp; \
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: mscclpp-ci
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclpp-ci --resource-group mscclpp
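
Reviewer note: the test step above points nccl-tests at MSCCL++ via `LD_PRELOAD=.../libmscclpp_nccl.so`, so the benchmark's standard NCCL calls are served by MSCCL++'s NCCL-compatible library. The sketch below shows the kind of NCCL-API usage being exercised; it is not part of this commit, and the buffer size, data type, and one-process-per-GPU layout are assumptions mirroring the `mpirun -np 8 ... -d half` invocation.

```cpp
// Sketch only (not part of this commit): the NCCL calls that nccl-tests issues and
// that libmscclpp_nccl.so intercepts when preloaded. Assumes one MPI rank per GPU.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>

int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);
  int rank, nranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  cudaSetDevice(rank);  // assumes one process per local GPU

  // Rank 0 creates the NCCL unique ID and broadcasts it to all ranks.
  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, rank);

  const size_t count = 1 << 20;  // element count; the pipeline sweeps 1K..1G bytes
  half *sendbuf, *recvbuf;
  cudaMalloc(&sendbuf, count * sizeof(half));
  cudaMalloc(&recvbuf, count * sizeof(half));

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // With LD_PRELOAD set, this allreduce runs on MSCCL++ rather than stock NCCL.
  ncclAllReduce(sendbuf, recvbuf, count, ncclFloat16, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);

  ncclCommDestroy(comm);
  MPI_Finalize();
  return 0;
}
```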
2 changes: 1 addition & 1 deletion README.md
@@ -50,7 +50,7 @@ MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer

```cpp
// `ProxyChannel` will be explained in the following section.
-__device__ mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> channel;
+__device__ mscclpp::DeviceHandle<mscclpp::ProxyChannel> channel;
__global__ void gpuKernel() {
...
// Only one thread is needed for this method.
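Reviewer note: the README snippet is cut off by the diff view. As a rough illustration of the comment above ("only one thread is needed"), a single thread can drive a whole transfer through the renamed handle. This sketch is not from the README; it borrows `putWithSignalAndFlush` from the design-document example later in this commit, and the zero offsets are assumptions.

```cpp
// Sketch only: a lone thread hands the copy off to the host-side proxy.
__device__ mscclpp::DeviceHandle<mscclpp::ProxyChannel> channel;

__global__ void gpuKernel(size_t nBytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    // Offsets are into the memory regions registered with the ProxyService.
    channel.putWithSignalAndFlush(0 /*dstOffset*/, 0 /*srcOffset*/, nBytes);
  }
}
```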
1 change: 1 addition & 0 deletions docker/base-dev-x.dockerfile
@@ -28,6 +28,7 @@ ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

# Set PATH
6 changes: 3 additions & 3 deletions docs/design/design.md
@@ -117,7 +117,7 @@ In this section, we will discuss several use cases that demonstrate the capabili

MSCCL++ enables the offloading of communication logic from the GPU to the CPU, facilitating the overlapping of communication and computation processes. The code snippet provided illustrates this overlapping technique. In the depicted scenario, the GPU emits a signal to the CPU indicating readiness for data transfer. Subsequently, while the GPU continues to execute computation tasks, the CPU initiates the data transfer to the designated target device.
```cpp
-__device__ void gpuKernel(mscclpp::SimpleProxyChannelDeviceHandle* proxyChannel) {
+__device__ void gpuKernel(mscclpp::ProxyChannelDeviceHandle* proxyChannel) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Send a trigger to the CPU
if (tid == 0) {
@@ -138,11 +138,11 @@ Traditional communication libraries enforce a separation between communication a
MSCCL++ offers a low-level communication API, allowing users to design customized collective communication algorithms. The following code demonstrates how to implement a customized All2All algorithm using MSCCL++.
```cpp
using DeviceHandle = mscclpp::DeviceHandle<T>;
-__device__ void localAlltoall(DeviceHandle<mscclpp::SimpleProxyChannel>* proxyChans, int rank,
+__device__ void localAlltoall(DeviceHandle<mscclpp::ProxyChannel>* proxyChans, int rank,
int nRanksPerNode, size_t nElements) {
int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
for (int i = 1; i < nRanksPerNode; i++) {
-    DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = proxyChans[blockIdx.x];
+    DeviceHandle<mscclpp::ProxyChannel> proxyChan = proxyChans[blockIdx.x];
if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
nElements * sizeof(int));
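Reviewer note: since `localAlltoall` above is a `__device__` function indexed by `blockIdx.x`, it has to be called from a kernel launched with roughly one block per peer. The wrapper and launch configuration below are a sketch, not code from this commit; the block and thread counts are assumptions.

```cpp
// Sketch only: wrap localAlltoall in a kernel so it can be launched from the host.
__global__ void alltoallKernel(DeviceHandle<mscclpp::ProxyChannel>* proxyChans, int rank,
                               int nRanksPerNode, size_t nElements) {
  localAlltoall(proxyChans, rank, nRanksPerNode, nElements);
}

// Host side: proxyChans points to device memory holding one handle per peer.
// alltoallKernel<<<nRanksPerNode - 1, 32>>>(proxyChans, rank, nRanksPerNode, nElements);
```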
10 changes: 5 additions & 5 deletions docs/getting-started/tutorials/initialization.md
@@ -21,7 +21,7 @@ We will set up a mesh topology with eight GPUs. Each GPU will be connected to its

template <class T>
using DeviceHandle = mscclpp::DeviceHandle<T>;
-__constant__ DeviceHandle<mscclpp::SimpleProxyChannel> constProxyChans[8];
+__constant__ DeviceHandle<mscclpp::ProxyChannel> constProxyChans[8];

void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
std::string ip_port = "10.0.0.4:50000";
@@ -55,17 +55,17 @@ void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {

comm.setup();

-  std::vector<DeviceHandle<mscclpp::SimpleProxyChannel>> proxyChannels;
+  std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannels;
for (size_t i = 0; i < semaphoreIds.size(); ++i) {
-    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::SimpleProxyChannel(
+    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::ProxyChannel(
proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
proxyService.addMemory(localMemories[i]))));
}

-  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>)) {
+  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::ProxyChannel>)) {
std::runtime_error("unexpected error");
}
CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
-                               sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>) * proxyChannels.size()));
+                               sizeof(DeviceHandle<mscclpp::ProxyChannel>) * proxyChannels.size()));
}
```
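Reviewer note: on the device side, the handles staged into `constProxyChans` above can then be used directly from a kernel. The kernel below is a sketch, not part of the tutorial; it reuses the `putWithSignalAndFlush` call shown in the design document, and the zero offsets into the registered regions are assumptions.

```cpp
// Sketch only: consume the handles copied into __constant__ memory above.
__global__ void putToPeer(int peerIdx, size_t nBytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    constProxyChans[peerIdx].putWithSignalAndFlush(0 /*dstOffset*/, 0 /*srcOffset*/, nBytes);
  }
}
```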
4 changes: 2 additions & 2 deletions docs/getting-started/tutorials/python-api.md
@@ -47,7 +47,7 @@ We provide some Python utils to help you launch kernels via Python. Here is an example
```python
from mscclpp.utils import KernelBuilder, pack

-def launch_kernel(my_rank: int, nranks: int, simple_channels: List[SimpleProxyChannel], memory: cp.ndarray):
+def launch_kernel(my_rank: int, nranks: int, simple_channels: List[ProxyChannel], memory: cp.ndarray):
file_dir = os.path.dirname(os.path.abspath(__file__))
kernel = KernelBuilder(file="test.cu", kernel_name="test", file_dir=file_dir).get_compiled_kernel()
params = b""
@@ -77,7 +77,7 @@ The test kernel is defined in `test.cu` as follows:
// be careful about using channels[my_rank] as it is invalid; it is there just for simplicity of indexing
extern "C" __global__ void __launch_bounds__(1024, 1)
-    simple_proxy_channel(mscclpp::SimpleProxyChannelDeviceHandle* channels, int my_rank, int nranks,
+    proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks,
int num_elements) {
int tid = threadIdx.x;
int nthreads = blockDim.x;
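Reviewer note: the body of the test kernel is cut off above. The sketch below is not the actual `test.cu` from this repository; it only shows one plausible way the `ProxyChannelDeviceHandle` array passed from Python could be used, assuming `int` elements and zero offsets.

```cpp
// Sketch only (not the actual test.cu): one possible body for the truncated kernel.
extern "C" __global__ void __launch_bounds__(1024, 1)
    proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks,
                  int num_elements) {
  int tid = threadIdx.x;
  if (tid == 0) {
    for (int r = 0; r < nranks; ++r) {
      if (r == my_rank) continue;  // channels[my_rank] is a placeholder and must not be used
      channels[r].putWithSignalAndFlush(0 /*dstOffset*/, 0 /*srcOffset*/,
                                        num_elements * sizeof(int));
    }
  }
}
```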
59 changes: 33 additions & 26 deletions include/mscclpp/proxy_channel.hpp
@@ -11,6 +11,7 @@

namespace mscclpp {

+struct BaseProxyChannel;
struct ProxyChannel;

/// Base class for proxy services. Proxy services are used to proxy data between devices.
@@ -48,10 +49,17 @@ class ProxyService : public BaseProxyService {
/// @return The semaphore.
std::shared_ptr<Host2DeviceSemaphore> semaphore(SemaphoreId id) const;

-  /// Get a proxy channel by semaphore ID.
+  /// Get a base proxy channel by semaphore ID.
/// @param id The ID of the semaphore.
+  /// @return The base proxy channel.
+  BaseProxyChannel baseProxyChannel(SemaphoreId id);
+
+  /// Get a proxy channel by semaphore ID and memory regions.
+  /// @param id The ID of the semaphore.
+  /// @param dst The destination memory region.
+  /// @param src The source memory region.
/// @return The proxy channel.
-  ProxyChannel proxyChannel(SemaphoreId id);
+  ProxyChannel proxyChannel(SemaphoreId id, MemoryId dst, MemoryId src);

/// Start the proxy service.
void startProxy();
@@ -71,66 +71,65 @@ class ProxyService : public BaseProxyService {
};

/// Proxy channel.
-struct ProxyChannel {
- private:
+struct BaseProxyChannel {
+ protected:
SemaphoreId semaphoreId_;

std::shared_ptr<Host2DeviceSemaphore> semaphore_;

std::shared_ptr<Proxy> proxy_;

public:
-  ProxyChannel() = default;
+  BaseProxyChannel() = default;

-  ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore, std::shared_ptr<Proxy> proxy);
+  BaseProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore,
+                   std::shared_ptr<Proxy> proxy);

-  ProxyChannel(const ProxyChannel& other) = default;
+  BaseProxyChannel(const BaseProxyChannel& other) = default;

-  ProxyChannel& operator=(ProxyChannel& other) = default;
+  BaseProxyChannel& operator=(BaseProxyChannel& other) = default;

-  /// Device-side handle for @ref ProxyChannel.
-  using DeviceHandle = ProxyChannelDeviceHandle;
+  /// Device-side handle for @ref BaseProxyChannel.
+  using DeviceHandle = BaseProxyChannelDeviceHandle;

/// Returns the device-side handle.
///
-  /// User should make sure the ProxyChannel is not released when using the returned handle.
+  /// User should make sure the BaseProxyChannel is not released when using the returned handle.
///
DeviceHandle deviceHandle() const;
};

-/// Simple proxy channel with a single destination and source memory region.
-struct SimpleProxyChannel {
+/// A common form of proxy channel with a single destination and source memory region.
+struct ProxyChannel : public BaseProxyChannel {
private:
-  ProxyChannel proxyChan_;
MemoryId dst_;
MemoryId src_;

public:
/// Default constructor.
-  SimpleProxyChannel() = default;
+  ProxyChannel() = default;

/// Constructor.
-  /// @param proxyChan The proxy channel.
+  /// @param semaphoreId The ID of the semaphore.
+  /// @param semaphore The semaphore.
+  /// @param proxy The proxy.
/// @param dst The destination memory region.
/// @param src The source memory region.
-  SimpleProxyChannel(ProxyChannel proxyChan, MemoryId dst, MemoryId src);
-
-  /// Constructor.
-  /// @param proxyChan The proxy channel.
-  SimpleProxyChannel(ProxyChannel proxyChan) : proxyChan_(proxyChan) {}
+  ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore, std::shared_ptr<Proxy> proxy,
+               MemoryId dst, MemoryId src);

/// Copy constructor.
-  SimpleProxyChannel(const SimpleProxyChannel& other) = default;
+  ProxyChannel(const ProxyChannel& other) = default;

/// Assignment operator.
-  SimpleProxyChannel& operator=(SimpleProxyChannel& other) = default;
+  ProxyChannel& operator=(ProxyChannel& other) = default;

-  /// Device-side handle for @ref SimpleProxyChannel.
-  using DeviceHandle = SimpleProxyChannelDeviceHandle;
+  /// Device-side handle for @ref ProxyChannel.
+  using DeviceHandle = ProxyChannelDeviceHandle;

/// Returns the device-side handle.
///
-  /// User should make sure the SimpleProxyChannel is not released when using the returned handle.
+  /// User should make sure the ProxyChannel is not released when using the returned handle.
///
DeviceHandle deviceHandle() const;
};
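
Reviewer note: callers that previously built a `SimpleProxyChannel` from `proxyChannel(id)` plus two `MemoryId`s now get the same thing from the new `proxyChannel(id, dst, src)` overload. The loop below is a sketch of how the initialization-tutorial code earlier in this commit might migrate; it is not part of this commit, and the variable names (`proxyService`, `semaphoreIds`, `remoteMemories`, `localMemories`) are borrowed from that tutorial.

```cpp
// Sketch only: building the renamed ProxyChannel through the new overload.
std::vector<mscclpp::DeviceHandle<mscclpp::ProxyChannel>> proxyChannels;
for (size_t i = 0; i < semaphoreIds.size(); ++i) {
  mscclpp::MemoryId dst = proxyService.addMemory(remoteMemories[i].get());
  mscclpp::MemoryId src = proxyService.addMemory(localMemories[i]);
  proxyChannels.push_back(mscclpp::deviceHandle(proxyService.proxyChannel(semaphoreIds[i], dst, src)));
}
```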
(Diffs for the remaining 23 changed files are omitted.)
