diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md
index 8c0982e3e..4ec1ba912 100644
--- a/docs/getting-started/quickstart.md
+++ b/docs/getting-started/quickstart.md
@@ -19,6 +19,10 @@
     ```
     lsmod | grep nvidia_peermem
     ```
+  * For GPUs with NVLS support, the IMEX channels must be set up. You can set up the channels manually via:
+    ```
+    sudo nvidia-modprobe -s -i
+    ```
 
 ## Build with Docker Images
 
diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index cbfa882de..864c9f532 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -33,18 +33,6 @@
     }                                                        \
   } while (false)
 
-#define MSCCLPP_CULOG_WARN(cmd)                              \
-  do {                                                       \
-    CUresult err = cmd;                                      \
-    if (err != CUDA_SUCCESS) {                               \
-      const char* errStr;                                    \
-      if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS) {  \
-        errStr = "failed to get error string";               \
-      }                                                      \
-      WARN("Call to " #cmd " failed, error is %s", errStr);  \
-    }                                                        \
-  } while (false)
-
 namespace mscclpp {
 
 /// set memory access permission to read-write
diff --git a/include/mscclpp/nvls.hpp b/include/mscclpp/nvls.hpp
index 126afa6e5..36ad614ba 100644
--- a/include/mscclpp/nvls.hpp
+++ b/include/mscclpp/nvls.hpp
@@ -39,6 +39,11 @@ class NvlsConnection {
     friend class NvlsConnection;
   };
 
+  /// @brief Bind memory allocated via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior
+  /// is undefined if @p devicePtr was not allocated by @ref mscclpp::allocSharedPhysicalCuda.
+  /// @param devicePtr The device pointer to bind to the multicast handle.
+  /// @param size The size of the memory region to bind, in bytes.
+  /// @return A DeviceMulticastPointer holding devicePtr, mcPtr, and bufferSize.
   DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
 
   size_t getMultiCastMinGranularity();
diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py
index db8bc9581..88840a743 100644
--- a/python/mscclpp_benchmark/mscclpp_op.py
+++ b/python/mscclpp_benchmark/mscclpp_op.py
@@ -448,9 +448,10 @@ def __init__(
             buffer_raw.get_ptr(), aligned_buffer_size
         )  # just using recommended size for now
         self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
-        self.buffer_raw = buffer_raw
-        self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0)
+        self.cp_memory_ptr = cp.cuda.MemoryPointer(
+            cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0
+        )
         self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
 
         # create a sm_channel for each remote neighbor
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 29ad804fc..60cf36b95 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -243,7 +243,7 @@ def main(
     parser.add_argument("-n", "--execution_plan_name", type=str, required=True)
     parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
     parser.add_argument("--size", type=str, required=True)
-    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation", default=True)
+    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
     parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
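An aside on the `--in_place` change in `executor_test.py`: with `action="store_true"`, argparse already defaults the flag to `False`, so the extra `default=True` made `args.in_place` `True` whether or not the flag was passed, i.e. the option could never be disabled. A minimal standalone repro (not part of the patch):

```python
import argparse

# Buggy form: default=True overrides store_true's implicit False,
# so the flag is always True and cannot be switched off.
buggy = argparse.ArgumentParser()
buggy.add_argument("--in_place", action="store_true", default=True)
assert buggy.parse_args([]).in_place is True
assert buggy.parse_args(["--in_place"]).in_place is True

# Fixed form: store_true alone defaults to False; passing the flag opts in.
fixed = argparse.ArgumentParser()
fixed.add_argument("--in_place", action="store_true")
assert fixed.parse_args([]).in_place is False
assert fixed.parse_args(["--in_place"]).in_place is True
```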
diff --git a/src/registered_memory.cc b/src/registered_memory.cc
index 4412c5aae..1ad97c1b2 100644
--- a/src/registered_memory.cc
+++ b/src/registered_memory.cc
@@ -11,6 +11,18 @@
 #include "debug.h"
 #include "utils_internal.hpp"
 
+#define MSCCLPP_CULOG_WARN(cmd)                              \
+  do {                                                       \
+    CUresult err = cmd;                                      \
+    if (err != CUDA_SUCCESS) {                               \
+      const char* errStr;                                    \
+      if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS) {  \
+        errStr = "failed to get error string";               \
+      }                                                      \
+      WARN("Call to " #cmd " failed, error is %s", errStr);  \
+    }                                                        \
+  } while (false)
+
 namespace {
 
 // Get the recommended granularity for cuMemAddressReserve
 size_t getRecommendedGranularity() {
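On `bindAllocatedMemory` in `nvls.hpp`: multicast binding works on physically backed memory, which is why the contract requires the pointer to come from `allocSharedPhysicalCuda`. A hedged sketch of the call sequence through the Python bindings, where `alloc_shared_physical_cuda` and `bind_allocated_memory` are assumed binding names (only `get_ptr` and `get_device_ptr` are visible in the benchmark hunk above):

```python
# Sketch only -- the binding names alloc_shared_physical_cuda and
# bind_allocated_memory are assumptions, not confirmed API.
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)  # physical allocation, required
handle = nvls_connection.bind_allocated_memory(buffer_raw.get_ptr(), aligned_buffer_size)
device_ptr = handle.get_device_ptr()  # unicast pointer; the handle also carries the multicast pointer
```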
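On the `mscclpp_op.py` change: `cp.cuda.UnownedMemory(ptr, size, owner)` stores a reference to `owner`, so CuPy keeps the backing allocation alive for as long as the wrapping `ndarray` exists; passing `buffer_raw` as the owner replaces the manual `self.buffer_raw = buffer_raw` keepalive and ties the two lifetimes together explicitly. A minimal sketch of the pattern (`wrap_device_buffer` is a hypothetical helper; `owner` is any object whose lifetime controls the memory):

```python
import cupy as cp

def wrap_device_buffer(ptr: int, nbytes: int, owner) -> cp.ndarray:
    # UnownedMemory's third argument is the owning object; CuPy holds a
    # reference to it, preventing the backing allocation from being
    # garbage-collected while the returned ndarray is still alive.
    mem = cp.cuda.UnownedMemory(ptr, nbytes, owner)
    memptr = cp.cuda.MemoryPointer(mem, 0)
    return cp.ndarray((nbytes // 2,), cp.float16, memptr)  # e.g. a float16 view
```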