Skip to content

Commit

Permalink
Merge branch 'main' into binyli/fix
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang authored Oct 23, 2023
2 parents f8899ef + f688204 commit a7b5513
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ name: "CodeQL"

on:
push:
branches:
- '**'
branches: [ main ]
pull_request:
branches: [ main ]
schedule:
- cron: "30 1 * * 1"

Expand Down Expand Up @@ -42,8 +43,10 @@ jobs:
run: |
git config --global --add safe.directory /__w/mscclpp/mscclpp
- name: Autobuild
uses: github/codeql-action/autobuild@v2
- name: Build
run: |
MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
make -j
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
Expand Down
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ option(USE_NPKIT "Use NPKIT" ON)
option(BUILD_TESTS "Build tests" ON)
option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF)
option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)

# Find CUDAToolkit. Set CUDA flags based on the detected CUDA version
find_package(CUDAToolkit REQUIRED)
Expand All @@ -44,6 +45,16 @@ if(CUDAToolkit_FOUND)
endif()
set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver)

# Find if nvidia_peermem is installed and loaded.
# This configure-time check is skipped entirely when BYPASS_PEERMEM_CHECK is ON.
if(NOT BYPASS_PEERMEM_CHECK)
# Run `lsmod | grep nvidia_peermem` through sh. The pipeline's exit status is
# grep's, so lsmod_result is 0 only when the module name appears in the lsmod
# listing of the machine running CMake.
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
# lsmod_output is captured but not read in this block; only the exit status
# decides the outcome.
if(NOT lsmod_result EQUAL 0)
# FATAL_ERROR stops configuration, so the build cannot proceed without the
# module unless the check is bypassed.
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")
endif()
endif()

# Find ibverbs and libnuma
find_package(IBVerbs REQUIRED)
find_package(NUMA REQUIRED)
Expand Down
7 changes: 6 additions & 1 deletion docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@
* NVIDIA A100 GPUs + CUDA >= 11.1.1
* NVIDIA H100 GPUs + CUDA >= 12.0.0
* OS: tested on Ubuntu 18.04 and 20.04
* Libraries: [libnuma](https://github.com/numactl/numactl), [GDRCopy](https://github.com/NVIDIA/gdrcopy) (optional), MPI (optional)
* Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
* Others
* `nvidia_peermem` driver should be loaded on all nodes. Check it via:
```
lsmod | grep nvidia_peermem
```

## Build from Source

Expand Down
6 changes: 3 additions & 3 deletions include/mscclpp/poll.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__
int64_t __spin_cnt = 0; \
__status = 0; \
while (__cond) { \
if (__spin_cnt++ == __max_spin_cnt) { \
if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \
__status = 1; \
break; \
} \
Expand All @@ -29,7 +29,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__
do { \
int64_t __spin_cnt = 0; \
while (__cond) { \
if (__spin_cnt++ == __max_spin_cnt) { \
if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \
__assert_fail(#__cond, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
} \
} \
Expand All @@ -46,7 +46,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__
} else if (!(__cond2)) { \
break; \
} \
if (__spin_cnt++ == __max_spin_cnt) { \
if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \
__assert_fail(#__cond1 #__cond2, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
} \
} \
Expand Down
2 changes: 1 addition & 1 deletion src/semaphore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
int64_t spinCount = 0;
while (cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*(uint64_t*)localInboundSemaphore_.get()}.load(
cuda::memory_order_acquire) < (*expectedInboundSemaphore_)) {
if (spinCount++ == maxSpinCount) {
if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) {
throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout);
}
}
Expand Down

0 comments on commit a7b5513

Please sign in to comment.