diff --git a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp index 28c41c2ba0e034..a57ec699753716 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp @@ -1,6 +1,7 @@ #ifdef USE_C10D_XCCL #include +#include #include #include #include @@ -252,6 +253,25 @@ c10::intrusive_ptr ProcessGroupXCCL::allreduce( tensors.size() == 1, "Expecting one tensor only but got multiple"); auto tensor = tensors.back(); checkXPUTensor(tensor); + + RECORD_PARAM_COMMS_DATA( + // static_cast( + // this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective + 1, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + tensors, // inputTensors + tensors, // outputTensors + rank_, // rank + "allreduce", // collective name + tensor.numel(), // inNelems + tensor.numel(), // outNelems + tensor.scalar_type(), // dType + std::vector(), // inSplitSizes + std::vector(), // outSplitSizes + 0, // globalRankStart + 1, // globalRankStride + this->getSize()); // worldSize + return collective( tensor, tensor,