diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md
index 8c0982e3e..4ec1ba912 100644
--- a/docs/getting-started/quickstart.md
+++ b/docs/getting-started/quickstart.md
@@ -19,6 +19,10 @@
     ```
     lsmod | grep nvidia_peermem
     ```
+  * For GPUs with NVLS support, the IMEX channels must be set up. You can set up the channels manually via:
+    ```
+    sudo nvidia-modprobe -s -i
+    ```
 
 ## Build with Docker Images
 
diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index cbfa882de..864c9f532 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -33,18 +33,6 @@
     }                                                        \
   } while (false)
 
-#define MSCCLPP_CULOG_WARN(cmd)                              \
-  do {                                                       \
-    CUresult err = cmd;                                      \
-    if (err != CUDA_SUCCESS) {                               \
-      const char* errStr;                                    \
-      if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS) {  \
-        errStr = "failed to get error string";               \
-      }                                                      \
-      WARN("Call to " #cmd " failed, error is %s", errStr);  \
-    }                                                        \
-  } while (false)
-
 namespace mscclpp {
 
 /// set memory access permission to read-write
diff --git a/include/mscclpp/nvls.hpp b/include/mscclpp/nvls.hpp
index 126afa6e5..36ad614ba 100644
--- a/include/mscclpp/nvls.hpp
+++ b/include/mscclpp/nvls.hpp
@@ -39,6 +39,11 @@ class NvlsConnection {
     friend class NvlsConnection;
   };
 
+  /// @brief Bind memory allocated via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior
+  /// is undefined if @p devicePtr was not allocated by @ref mscclpp::allocSharedPhysicalCuda.
+  /// @param devicePtr The device pointer to bind to the multicast handle.
+  /// @param size The size of the memory region to bind, in bytes.
+  /// @return A DeviceMulticastPointer holding devicePtr, mcPtr, and bufferSize.
   DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
 
   size_t getMultiCastMinGranularity();
diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py
index db8bc9581..88840a743 100644
--- a/python/mscclpp_benchmark/mscclpp_op.py
+++ b/python/mscclpp_benchmark/mscclpp_op.py
@@ -448,9 +448,10 @@ def __init__(
             buffer_raw.get_ptr(), aligned_buffer_size
         )  # just using recommended size for now
         self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
-        self.buffer_raw = buffer_raw
-        self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0)
+        self.cp_memory_ptr = cp.cuda.MemoryPointer(
+            cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0
+        )
         self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
 
         # create a sm_channel for each remote neighbor
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 29ad804fc..60cf36b95 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -243,7 +243,7 @@ def main(
     parser.add_argument("-n", "--execution_plan_name", type=str, required=True)
     parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
     parser.add_argument("--size", type=str, required=True)
-    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation", default=True)
+    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
     parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
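An aside on the `--in_place` change in `executor_test.py`: with `action="store_true"`, argparse already defaults the flag to `False`, so the extra `default=True` made `args.in_place` `True` whether or not the flag was passed, i.e. the option could never be disabled. A minimal standalone repro (not part of the patch):

```python
import argparse

# Buggy form: default=True overrides store_true's implicit False,
# so the flag is always True and cannot be switched off.
buggy = argparse.ArgumentParser()
buggy.add_argument("--in_place", action="store_true", default=True)
assert buggy.parse_args([]).in_place is True
assert buggy.parse_args(["--in_place"]).in_place is True

# Fixed form: store_true alone defaults to False; passing the flag opts in.
fixed = argparse.ArgumentParser()
fixed.add_argument("--in_place", action="store_true")
assert fixed.parse_args([]).in_place is False
assert fixed.parse_args(["--in_place"]).in_place is True
```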
diff --git a/src/registered_memory.cc b/src/registered_memory.cc
index 4412c5aae..1ad97c1b2 100644
--- a/src/registered_memory.cc
+++ b/src/registered_memory.cc
@@ -11,6 +11,18 @@
 #include "debug.h"
 #include "utils_internal.hpp"
 
+#define MSCCLPP_CULOG_WARN(cmd)                              \
+  do {                                                       \
+    CUresult err = cmd;                                      \
+    if (err != CUDA_SUCCESS) {                               \
+      const char* errStr;                                    \
+      if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS) {  \
+        errStr = "failed to get error string";               \
+      }                                                      \
+      WARN("Call to " #cmd " failed, error is %s", errStr);  \
+    }                                                        \
+  } while (false)
+
 namespace {
 
 // Get the recommended granularity for cuMemAddressReserve
 size_t getRecommendedGranularity() {
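On `bindAllocatedMemory` in `nvls.hpp`: multicast binding works on physically backed memory, which is why the contract requires the pointer to come from `allocSharedPhysicalCuda`. A hedged sketch of the call sequence through the Python bindings, where `alloc_shared_physical_cuda` and `bind_allocated_memory` are assumed binding names (only `get_ptr` and `get_device_ptr` are visible in the benchmark hunk above):

```python
# Sketch only -- the binding names alloc_shared_physical_cuda and
# bind_allocated_memory are assumptions, not confirmed API.
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)  # physical allocation, required
handle = nvls_connection.bind_allocated_memory(buffer_raw.get_ptr(), aligned_buffer_size)
device_ptr = handle.get_device_ptr()  # unicast pointer; the handle also carries the multicast pointer
```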
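On the `mscclpp_op.py` change: `cp.cuda.UnownedMemory(ptr, size, owner)` stores a reference to `owner`, so CuPy keeps the backing allocation alive for as long as the wrapping `ndarray` exists; passing `buffer_raw` as the owner replaces the manual `self.buffer_raw = buffer_raw` keepalive and ties the two lifetimes together explicitly. A minimal sketch of the pattern (`wrap_device_buffer` is a hypothetical helper; `owner` is any object whose lifetime controls the memory):

```python
import cupy as cp

def wrap_device_buffer(ptr: int, nbytes: int, owner) -> cp.ndarray:
    # UnownedMemory's third argument is the owning object; CuPy holds a
    # reference to it, preventing the backing allocation from being
    # garbage-collected while the returned ndarray is still alive.
    mem = cp.cuda.UnownedMemory(ptr, nbytes, owner)
    memptr = cp.cuda.MemoryPointer(mem, 0)
    return cp.ndarray((nbytes // 2,), cp.float16, memptr)  # e.g. a float16 view
```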