diff --git a/docker/build.sh b/docker/build.sh index 3e2169f68..af4a23025 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -9,7 +9,7 @@ baseImageTable=( ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" - ["rocm6.2"]="rocm/rocm-terminal:6.2" + ["rocm6.2"]="rocm/rocm-terminal:6.2.1" ) declare -A extraLdPathTable diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 8bdf3587c..4baf8af2d 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -386,7 +386,7 @@ UniqueCudaHostPtr makeUniqueCudaHost(size_t count) { /// @param gran the granularity of the allocation. /// @return A std::unique_ptr to the allocated memory. template -std::unique_ptr allocUniquePhysicalCuda(size_t count, size_t gran = 0) { +std::unique_ptr allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) { #if (CUDA_FABRIC_SUPPORTED) if (!isFabricSupported()) { throw Error("Only suupport GPU with Fabric support", ErrorCode::InvalidUsage); diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 785ecbc03..99d8f213a 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -106,7 +106,7 @@ std::vector ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy } std::vector ExecutionPlan::Impl::getChannelInfosByDstRank(int rank, BufferType bufferType) const { - auto pred = [rank, bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; }; + auto pred = [bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; }; return filter(this->channelInfosByDstRank.at(rank), pred); } diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 1cc799e7a..1e9d6ac57 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -430,6 +430,7 @@ MSCCLPP_DEVICE_INLINE void handleCopy(void* dst, void* src, uint32_t dstOffset, Element::copy(dstData, srcData, size, threadIdx.x, blockDim.x); } +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* dst, T* src, uint32_t dstOffset, uint32_t srcOffset, size_t size) { @@ -458,6 +459,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* dst, T* src, uint32_t d DeviceMulticastPointerDeviceHandle::multimemStore(val, (vectorType*)dst + idx); } } +#endif template __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 114557892..2396cd94d 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -19,7 +19,7 @@ bool isCuMemMapAllocated(void* ptr) { if (result != CUDA_SUCCESS) { return false; } - cuMemRelease(handle); + MSCCLPP_CUTHROW(cuMemRelease(handle)); return true; } diff --git a/test/nvls_test.cu b/test/nvls_test.cu index 42aefdc2d..7a1a54ade 100644 --- a/test/nvls_test.cu +++ b/test/nvls_test.cu @@ -1,9 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include -#include -#include #include #include #include @@ -12,6 +9,9 @@ #include #if (CUDA_NVLS_SUPPORTED) +#include +#include +#include #define CUCHECK(cmd) \ do { \