Merge branch 'main' into binyli/fix

microsoft · Oct 23, 2023 · a7b5513 · a7b5513
2 parents f8899ef + f688204
commit a7b5513
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 9 deletions.
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql-analysis.yml
@@ -2,8 +2,9 @@ name: "CodeQL"
 
 on:
   push:
-    branches:
-      - '**'
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
   schedule:
     - cron: "30 1 * * 1"
 
@@ -42,8 +43,10 @@ jobs:
       run: |
         git config --global --add safe.directory /__w/mscclpp/mscclpp
 
-    - name: Autobuild
-      uses: github/codeql-action/autobuild@v2
+    - name: Build
+      run: |
+        MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
+        make -j
 
     - name: Perform CodeQL Analysis
       uses: github/codeql-action/analyze@v2

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,6 +26,7 @@ option(USE_NPKIT "Use NPKIT" ON)
 option(BUILD_TESTS "Build tests" ON)
 option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
 option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF)
+option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)
 
 # Find CUDAToolkit. Set CUDA flags based on the detected CUDA version
 find_package(CUDAToolkit REQUIRED)
@@ -44,6 +45,16 @@ if(CUDAToolkit_FOUND)
 endif()
 set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver)
 
+# Find if nvidia_peermem is installed and loaded
+if(NOT BYPASS_PEERMEM_CHECK)
+    execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
+                    RESULT_VARIABLE lsmod_result
+                    OUTPUT_VARIABLE lsmod_output)
+    if(NOT lsmod_result EQUAL 0)
+        message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")
+    endif()
+endif()
+
 # Find ibverbs and libnuma
 find_package(IBVerbs REQUIRED)
 find_package(NUMA REQUIRED)

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -11,7 +11,12 @@
     * NVIDIA A100 GPUs + CUDA >= 11.1.1
     * NVIDIA H100 GPUs + CUDA >= 12.0.0
 * OS: tested over Ubuntu 18.04 and 20.04
-* Libraries: [libnuma](https://github.com/numactl/numactl), [GDRCopy](https://github.com/NVIDIA/gdrcopy) (optional), MPI (optional)
+* Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
+* Others
+    * `nvidia_peermem` driver should be loaded on all nodes. Check it via:
+        ```
+        lsmod | grep nvidia_peermem
+        ```
 
 ## Build from Source
 

diff --git a/include/mscclpp/poll.hpp b/include/mscclpp/poll.hpp
@@ -17,7 +17,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__
     int64_t __spin_cnt = 0;                                           \
     __status = 0;                                                     \
     while (__cond) {                                                  \
-      if (__spin_cnt++ == __max_spin_cnt) {                           \
+      if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) {    \
         __status = 1;                                                 \
         break;                                                        \
       }                                                               \
@@ -29,7 +29,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__
   do {                                                                   \
     int64_t __spin_cnt = 0;                                              \
     while (__cond) {                                                     \
-      if (__spin_cnt++ == __max_spin_cnt) {                              \
+      if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) {       \
         __assert_fail(#__cond, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
       }                                                                  \
     }                                                                    \
@@ -46,7 +46,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__
       } else if (!(__cond2)) {                                                     \
         break;                                                                     \
       }                                                                            \
-      if (__spin_cnt++ == __max_spin_cnt) {                                        \
+      if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) {                 \
         __assert_fail(#__cond1 #__cond2, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
       }                                                                            \
     }                                                                              \

diff --git a/src/semaphore.cc b/src/semaphore.cc
@@ -78,7 +78,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
   int64_t spinCount = 0;
   while (cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*(uint64_t*)localInboundSemaphore_.get()}.load(
              cuda::memory_order_acquire) < (*expectedInboundSemaphore_)) {
-    if (spinCount++ == maxSpinCount) {
+    if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) {
       throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout);
     }
   }