diff --git a/docker/build.sh b/docker/build.sh
index 3e2169f68..af4a23025 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -9,7 +9,7 @@ baseImageTable=(
     ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
     ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
     ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
-    ["rocm6.2"]="rocm/rocm-terminal:6.2"
+    ["rocm6.2"]="rocm/rocm-terminal:6.2.1"
 )
 
 declare -A extraLdPathTable
diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index 8bdf3587c..4baf8af2d 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -386,7 +386,7 @@ UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
 /// @param gran the granularity of the allocation.
 /// @return A std::unique_ptr to the allocated memory.
 template <class T>
-std::unique_ptr<T> allocUniquePhysicalCuda(size_t count, size_t gran = 0) {
+std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
 #if (CUDA_FABRIC_SUPPORTED)
   if (!isFabricSupported()) {
     throw Error("Only suupport GPU with Fabric support", ErrorCode::InvalidUsage);
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
index 785ecbc03..99d8f213a 100644
--- a/src/executor/execution_plan.cc
+++ b/src/executor/execution_plan.cc
@@ -106,7 +106,7 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy
 }
 
 std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfosByDstRank(int rank, BufferType bufferType) const {
-  auto pred = [rank, bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
+  auto pred = [bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
   return filter(this->channelInfosByDstRank.at(rank), pred);
 }
 
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 1cc799e7a..1e9d6ac57 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -430,6 +430,7 @@ MSCCLPP_DEVICE_INLINE void handleCopy(void* dst, void* src, uint32_t dstOffset,
   Element::copy(dstData, srcData, size, threadIdx.x, blockDim.x);
 }
 
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
 template <typename T>
 MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* dst, T* src, uint32_t dstOffset, uint32_t srcOffset,
                                                       size_t size) {
@@ -458,6 +459,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* dst, T* src, uint32_t d
     DeviceMulticastPointerDeviceHandle::multimemStore(val, (vectorType*)dst + idx);
   }
 }
+#endif
 
 template <typename T, typename PacketType = LL16Packet>
 __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch,
diff --git a/src/registered_memory.cc b/src/registered_memory.cc
index 114557892..2396cd94d 100644
--- a/src/registered_memory.cc
+++ b/src/registered_memory.cc
@@ -19,7 +19,7 @@ bool isCuMemMapAllocated(void* ptr) {
   if (result != CUDA_SUCCESS) {
     return false;
   }
-  cuMemRelease(handle);
+  MSCCLPP_CUTHROW(cuMemRelease(handle));
   return true;
 }
 
diff --git a/test/nvls_test.cu b/test/nvls_test.cu
index 42aefdc2d..7a1a54ade 100644
--- a/test/nvls_test.cu
+++ b/test/nvls_test.cu
@@ -1,9 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <cuda.h>
-#include <cudaTypedefs.h>
-#include <cuda_runtime.h>
 #include <mpi.h>
 #include <stdio.h>
 #include <sys/syscall.h>
@@ -12,6 +9,9 @@
 
 #include <mscclpp/gpu.hpp>
 #if (CUDA_NVLS_SUPPORTED)
+#include <cuda.h>
+#include <cudaTypedefs.h>
+#include <cuda_runtime.h>
 
 #define CUCHECK(cmd)                                     \
   do {                                                   \