From b442419da4f529083b418be8d6b5cd1769423390 Mon Sep 17 00:00:00 2001 From: hanchao Date: Tue, 12 Nov 2024 02:23:48 +0000 Subject: [PATCH] update kvs key --- torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp | 2 +- torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 1527de6fe3f28..f202f8916f89f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -239,7 +239,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( const c10::intrusive_ptr& store, int rank, int size) - : Backend(rank, size), store_(store) { + : Backend(rank, size), store_(store), xcclCommCounter_(0) { blockingWait_ = getCvarBool(TORCH_XCCL_BLOCKING_WAIT, false); init(); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp index cbbd724f88c6b..c30ca603c7ba0 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp @@ -318,6 +318,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { std::unordered_map xcclEventsMap_; std::unordered_map> devXCCLCommMap_; c10::intrusive_ptr store_; + uint64_t xcclCommCounter_{0}; std::mutex mutex_; std::set usedDeviceIdxs_; int coalescing_state_ = 0; @@ -331,15 +332,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { private: std::mutex kvs_mutex; - ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, - bool singleP2POp = false, const std::string& p2pKey = "", int p2pRank = 0) { + ccl::shared_ptr_class get_kvs( + int rank, + c10d::Store& store, + bool singleP2POp = false, + const std::string& p2pKey = "", + int p2pRank = 0) { std::lock_guard lock(kvs_mutex); ccl::shared_ptr_class kvs; std::string storeKey; if (!singleP2POp) { - storeKey = "xccl_kvs"; + storeKey = std::to_string(xcclCommCounter_++); } else { - storeKey = p2pKey; + storeKey = p2pKey; } // Rank 0 broadcast the bootstrap network information to other ranks if (rank == 0 || (singleP2POp && p2pRank == 0)) {