From 31d2bab16bef4a0e2c470a758f74b3544a9abc45 Mon Sep 17 00:00:00 2001 From: hanchao Date: Mon, 2 Dec 2024 07:03:37 +0000 Subject: [PATCH] Remove XCCL-specific handling from c10d core; keep XCCL registration in torch-xpu-ops --- caffe2/CMakeLists.txt | 1 + torch/csrc/distributed/c10d/ProcessGroup.hpp | 2 -- torch/distributed/distributed_c10d.py | 11 +++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index a47fcbd0d7c1a4..97cbf86b63b024 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1104,6 +1104,7 @@ if(USE_XPU) message(WARNING "Failed to include ATen XPU implementation target") else() target_link_libraries(torch_xpu PRIVATE torch_xpu_ops) + # USE_C10D_XCCL from third_party torch-xpu-ops repository for xccl registration. if(USE_C10D_XCCL) target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL) endif() diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 1259813d5d7a5f..b3d9ce3b91d674 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -641,7 +641,6 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { getDefaultBackend()->setSequenceNumberForGroup(); } else { @@ -663,7 +662,6 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // TODO: HACK for backend name to get sequence number for that backend. 
if (backendType == ProcessGroup::BackendType::GLOO || backendType == ProcessGroup::BackendType::NCCL || - backendType == ProcessGroup::BackendType::XCCL || backendType == ProcessGroup::BackendType::UCC) { return getDefaultBackend()->getSequenceNumberForGroup(); } else { diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 18d1f418180f83..d0cae808fcaa43 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -341,7 +341,7 @@ def register_backend( Backend.backend_list.append(name.lower()) if devices is not None: for device in devices: - if device != "cpu" and device != "cuda" and device != "xpu": + if device != "cpu" and device != "cuda": Backend.default_device_backend_map[device] = name.lower() Backend.backend_type_map[name.lower()] = ProcessGroup.BackendType.CUSTOM @@ -1495,7 +1495,7 @@ def init_process_group( Args: backend (str or Backend, optional): The backend to use. Depending on - build-time configurations, valid values include ``mpi``, ``gloo``, ``xccl``, + build-time configurations, valid values include ``mpi``, ``gloo``, ``nccl``, and ``ucc``. If the backend is not provided, then both a ``gloo`` and ``nccl`` backend will be created, see notes below for how multiple backends are managed. This field can be given as a lowercase string @@ -1775,9 +1775,10 @@ def _new_process_group_helper( "created, please use a different group name" ) - if device_id is not None and device_id.index is None: + if device_id is not None and (device_id.index is None or device_id.type != "cuda"): raise ValueError( - "init_process_group device_id parameter must be a device with an index" + "init_process_group device_id parameter must be a cuda device with an " + "id, e.g. 
cuda:0, not just cuda or cpu" ) # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value @@ -2730,6 +2731,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False): else: work.wait() + @_exception_logger @deprecated( "`torch.distributed.all_reduce_coalesced` will be deprecated. If you must " @@ -4095,6 +4097,7 @@ def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=F else: work.wait() + @deprecated( "`torch.distributed._reduce_scatter_base` is a private function and will be deprecated. " "Please use `torch.distributed.reduce_scatter_tensor` instead.",