diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index d03e6fdcc..b1fabdf21 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -147,10 +147,8 @@ static std::shared_ptr> setupSmChannel [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); std::shared_ptr> ptr = mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - mscclpp::AvoidCudaGraphCaptureGuard guard; - CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); + mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(), + smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 01f875099..8e9e17ab5 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -6,7 +6,7 @@ #if defined(__HIP_PLATFORM_AMD__) -#include +// #include #include #include diff --git a/src/fifo.cc b/src/fifo.cc index 4255bcdcd..592bf7d00 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -56,6 +56,7 @@ MSCCLPP_API_CPP void Fifo::pop() { MSCCLPP_API_CPP void Fifo::flushTail(bool sync) { // Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure that the fifo can // make progress even if there is no request mscclppSync. However, mscclppSync type is for flush request. + AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice, pimpl->stream)); if (sync) {