Commit
Merge branch 'main' into v0.3.0
chhwang authored Oct 8, 2023
2 parents 6703962 + 11ac824 commit 9de93d3
Showing 15 changed files with 280 additions and 90 deletions.
78 changes: 78 additions & 0 deletions .github/workflows/integration-test-backup.yml
@@ -0,0 +1,78 @@
name: IntegrationTest

on: workflow_dispatch

jobs:
IntegrationTest:
runs-on: self-hosted
defaults:
run:
shell: bash
strategy:
matrix:
cuda: [ cuda11.8, cuda12.1 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Install CMake
run: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
- name: Build
run: |
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: Run mscclpp AllGather test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
8 changes: 2 additions & 6 deletions .github/workflows/lint.yml
@@ -38,12 +38,8 @@ jobs:
- name: Install Python dependencies
run: python3.8 -m pip install black

- name: Run linters
uses: wearerequired/lint-action@v2
with:
black: true
black_auto_fix: false
black_args: "--config pyproject.toml --check"
- name: Run black
run: python3.8 -m black --check --config pyproject.toml .

spelling:
runs-on: ubuntu-20.04
66 changes: 66 additions & 0 deletions .github/workflows/ut-backup.yml
@@ -0,0 +1,66 @@
name: UnitTest

on: workflow_dispatch

jobs:
UnitTest:
runs-on: self-hosted
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.1 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Build
run: |
curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}

- name: LockGPUClock
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
working-directory: ${{ github.workspace }}

- name: UnitTests
run: |
./build/test/unit_tests
working-directory: ${{ github.workspace }}

- name: MpUnitTests
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
working-directory: ${{ github.workspace }}

- name: PyTests
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
python3 -m pip install -r ../python/test/requirements_cu11.txt
else
python3 -m pip install -r ../python/test/requirements_cu12.txt
fi
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
working-directory: ${{ github.workspace }}
37 changes: 20 additions & 17 deletions include/mscclpp/packet.hpp
@@ -48,7 +48,7 @@ union LLPacket {
/// @param flag The flag to read.
/// @param data The 8-byte data read.
/// @return True if the flag is not equal to the given flag.
__forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) {
__forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) const {
uint32_t flag1, flag2;
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(data.x), "=r"(flag1), "=r"(data.y), "=r"(flag2)
@@ -60,7 +60,7 @@ union LLPacket {
/// @param flag The flag to read.
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
/// @return The 8-byte data read.
__forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) {
__forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) const {
uint2 data;
POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount);
return data;
@@ -75,28 +75,31 @@
};
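
For illustration, a minimal sketch of the LL-packet handshake this interface implies, assuming a writer and a reader share a packet buffer visible to both sides (kernel and buffer names are hypothetical):

// Hypothetical sketch: the writer stores 8 bytes of data plus the flag in one
// 16-byte volatile store; the reader spins until the flag matches.
__global__ void llWriter(mscclpp::LLPacket* packets, uint2 value, uint32_t flag) {
  if (threadIdx.x == 0) {
    packets[0].write(value.x, value.y, flag);
  }
}

__global__ void llReader(mscclpp::LLPacket* packets, uint2* out, uint32_t flag) {
  if (threadIdx.x == 0) {
    out[0] = packets[0].read(flag);  // polls readOnce() until the flag arrives
  }
}

Because the data and flag travel in a single vectorized store, the reader needs no separate fence to know the payload is valid once the flag matches.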

#ifdef __CUDACC__
__forceinline__ __device__ void putPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset,
uint64_t srcBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
/// Read from the origin and write to the target buffer.
__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr,
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
uint32_t* srcBase = (uint32_t*)((char*)src + srcOffset);
LLPacket* dstBase = (LLPacket*)((char*)dst + dstOffset);
size_t nElem = srcBytes / sizeof(uint64_t);
const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
size_t nElem = originBytes / sizeof(uint64_t);
for (size_t i = threadId; i < nElem; i += numThreads) {
LLPacket* pkt = &dstBase[i];
pkt->write(srcBase[2 * i], srcBase[2 * i + 1], flag);
LLPacket* pkt = &targetBase[i];
pkt->write(originBase[2 * i], originBase[2 * i + 1], flag);
}
}

__forceinline__ __device__ void getPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset,
uint64_t dstBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
/// Read from the target buffer and write to the origin.
__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr,
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
// TODO(saemal): this is not matching sm_channel get method.
LLPacket* srcBase = (LLPacket*)((char*)src + srcOffset);
uint2* dstBase = (uint2*)((char*)dst + dstOffset);
size_t nElem = dstBytes / sizeof(uint2);
const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
uint2* originBase = (uint2*)((char*)originPtr + originOffset);
size_t nElem = originBytes / sizeof(uint2);
for (size_t i = threadId; i < nElem; i += numThreads) {
LLPacket* pkt = &srcBase[i];
dstBase[i] = pkt->read(flag);
const LLPacket* pkt = &targetBase[i];
originBase[i] = pkt->read(flag);
}
}
#endif // __CUDACC__
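
As a usage sketch of the new origin/target naming (buffer and kernel names hypothetical): putPackets stages plain data from the origin into LLPackets at the target, and getPackets unpacks in the opposite direction.

// Hypothetical kernels. Note the target packet buffer must be twice the data
// size, since each 16-byte LLPacket carries only 8 bytes of payload.
__global__ void packKernel(void* targetPkts, const void* originData, uint64_t bytes, uint32_t flag) {
  mscclpp::putPackets(targetPkts, /*targetOffset=*/0, originData, /*originOffset=*/0, bytes,
                      threadIdx.x, blockDim.x, flag);
}

__global__ void unpackKernel(const void* targetPkts, void* originData, uint64_t bytes, uint32_t flag) {
  mscclpp::getPackets(targetPkts, /*targetOffset=*/0, originData, /*originOffset=*/0, bytes,
                      threadIdx.x, blockDim.x, flag);
}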
3 changes: 3 additions & 0 deletions include/mscclpp/proxy.hpp
@@ -28,6 +28,9 @@ class Proxy {
void start();
void stop();

/// A concurrent FIFO into which multiple threads on the device can produce
/// work and from which the sole proxy thread consumes it.
/// @return A reference to the FIFO.
Fifo& fifo();

private:
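
For context, a hedged host-side sketch of the producer/consumer pattern around this FIFO (handler signature follows the ProxyHandler-callback style; exact names here are assumptions, not verbatim API):

// Illustrative only: device-side threads push ProxyTriggers through the FIFO's
// device handle, while the single proxy thread pops and services them.
mscclpp::Proxy proxy(
    [](mscclpp::ProxyTrigger trigger) {
      // Decode `trigger` and issue the matching host-side operation here.
      return mscclpp::ProxyHandlerResult::Continue;  // keep the proxy running
    },
    []() { /* one-time thread setup, e.g., NUMA binding */ });
proxy.start();  // spawns the sole consumer thread that drains proxy.fifo()
// ... device kernels produce entries via the FIFO's device handle ...
proxy.stop();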
10 changes: 4 additions & 6 deletions include/mscclpp/proxy_channel.hpp
@@ -62,7 +62,7 @@ class ProxyService : public BaseProxyService {
private:
std::vector<std::shared_ptr<Host2DeviceSemaphore>> semaphores_;
std::vector<RegisteredMemory> memories_;
Proxy proxy_;
std::shared_ptr<Proxy> proxy_;
int deviceNumaNode;

void bindThread();
Expand All @@ -75,16 +75,14 @@ struct ProxyChannel {
private:
SemaphoreId semaphoreId_;

Host2DeviceSemaphore::DeviceHandle semaphore_;
std::shared_ptr<Host2DeviceSemaphore> semaphore_;

// This is a concurrent FIFO into which multiple threads on the device can
// produce work and from which the sole proxy thread consumes it.
FifoDeviceHandle fifo_;
std::shared_ptr<Proxy> proxy_;

public:
ProxyChannel() = default;

ProxyChannel(SemaphoreId semaphoreId, Host2DeviceSemaphore::DeviceHandle semaphore, FifoDeviceHandle fifo);
ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr<Host2DeviceSemaphore> semaphore, std::shared_ptr<Proxy> proxy);

ProxyChannel(const ProxyChannel& other) = default;

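
A hedged sketch of how a channel is obtained under this refactoring, with the service owning the shared Proxy (assumes an existing Communicator `comm` and Connection `connection`; construction details are illustrative, not verbatim API):

// Illustrative only: ProxyChannel now holds shared_ptrs into the service-owned
// semaphore and proxy instead of raw device handles.
mscclpp::ProxyService service;
mscclpp::SemaphoreId sid = service.buildAndAddSemaphore(comm, connection);
mscclpp::ProxyChannel channel = service.proxyChannel(sid);
service.startProxy();
// ... hand the channel's device handle to a kernel ...
service.stopProxy();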
