diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index bfe028852e942a..4d0b9f17751fae 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1243,8 +1243,8 @@ void ProcessGroupNCCL::waitForFutureOrTimeout( try { bool result = fut.get(); if (result) { - LOG(INFO) << logPrefix() - << "future is successfully executed for: " << futDescription; + VLOG(2) << logPrefix() + << "future is successfully executed for: " << futDescription; if (log) { data.strings["status"] = "SUCCESS"; } @@ -1311,8 +1311,9 @@ void ProcessGroupNCCL::abortCommsFromMap( // TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey` gpuGuard.set_index(deviceIndex); } - LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ " - << ncclComm->repr() << " on CUDA device: " << devName; + + VLOG(2) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ " + << ncclComm->repr() << " on CUDA device: " << devName; ncclComm->ncclCommAbort(abortReason); // Note that we don't remove the aborted communicators from the // cache. The reason is that if we do remove the communicator @@ -1324,8 +1325,8 @@ void ProcessGroupNCCL::abortCommsFromMap( // their responsibility to destroy the process group and recreate // it to recover from errors. - LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroyed " - << " communicator on CUDA device: " << devName; + VLOG(2) << logPrefix() << "ProcessGroupNCCL destroyed " + << " communicator on CUDA device: " << devName; } } @@ -1391,7 +1392,7 @@ void ProcessGroupNCCL::shutdown() { // NOLINTNEXTLINE(bugprone-exception-escape) ProcessGroupNCCL::~ProcessGroupNCCL() { - LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered."; + VLOG(2) << logPrefix() << "ProcessGroupNCCL destructor entered."; if (!terminateProcessGroup_.load()) { if (rank_ % localDeviceCount_ == 0) { @@ -1413,19 +1414,19 @@ ProcessGroupNCCL::~ProcessGroupNCCL() { if (!blockingWait_) { if (ncclCommWatchdogThread_.joinable()) { ncclCommWatchdogThread_.join(); - LOG(INFO) << logPrefix() << "ProcessGroupNCCL watchdog thread joined."; + VLOG(2) << logPrefix() << "ProcessGroupNCCL watchdog thread joined."; } if (ncclHeartbeatMonitorThread_.joinable()) { ncclHeartbeatMonitorThread_.join(); - LOG(INFO) << logPrefix() - << "ProcessGroupNCCL heart beat monitor thread joined."; + VLOG(2) << logPrefix() + << "ProcessGroupNCCL heart beat monitor thread joined."; } } #endif if (onCompletionHookThread_.joinable()) { onCompletionHookThread_.join(); - LOG(INFO) << logPrefix() - << "ProcessGroupNCCL onCompletionHookThread thread joined."; + VLOG(2) << logPrefix() + << "ProcessGroupNCCL onCompletionHookThread thread joined."; } } @@ -1673,7 +1674,7 @@ void ProcessGroupNCCL::heartbeatMonitor() { << "Could not acquire GIL within 300 ms on exit, possible GIL induced hang"; } } else { - LOG(INFO) + VLOG(2) << logPrefix() << "GIL checker was not registered, perhaps this is a no-python build?"; } @@ -1748,7 +1749,7 @@ void ProcessGroupNCCL::ncclCommWatchdog() { } catch (std::exception& e) { if (std::string(e.what()).find("driver shutting down") != std::string::npos) { - LOG(INFO) + VLOG(2) << logPrefix() << "main process destroyed cuda before watchdog loop exited, terminating watchdog." << " (Watchdog caught exception: " << e.what(); @@ -2481,9 +2482,9 @@ std::shared_ptr ProcessGroupNCCL::initNCCLComm( globalRankStride, // globalRankStride size_); // worldSize - LOG(INFO) << logPrefix() << "ProcessGroupNCCL created ncclComm_ " - << ncclComm->repr() - << " on CUDA device: " << static_cast(deviceIndex); + VLOG(2) << logPrefix() << "ProcessGroupNCCL created ncclComm_ " + << ncclComm->repr() + << " on CUDA device: " << static_cast(deviceIndex); // At this point NCCL should have been initialized, hence we can accurately // get the env value even if NCCL sets it by reading from nccl.conf file