Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add distributed backend (XCCL) #1105

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Simply cmake logit
  • Loading branch information
Chao1Han committed Dec 13, 2024
commit d139548f85b304882697924402d8473cb55ef9bf
13 changes: 0 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,9 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake)
include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake)

option(USE_XCCL "Build with XCCL support" ON)
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
if (DEFINED ENV{USE_XCCL})
string(TOLOWER "$ENV{USE_XCCL}" USE_XCCL_LOWER)

if (NOT (USE_XCCL_LOWER STREQUAL "1" OR
USE_XCCL_LOWER STREQUAL "on" OR
USE_XCCL_LOWER STREQUAL "yes"))
set(USE_XCCL OFF CACHE BOOL "Build with XCCL support" FORCE)
else()
set(USE_XCCL ON CACHE BOOL "Build with XCCL support" FORCE)
endif()
endif()

if(NOT WIN32 AND USE_XCCL)
include(${TORCH_XPU_OPS_ROOT}/cmake/XCCL.cmake)
set(USE_C10D_XCCL ON)
set(USE_C10D_XCCL ${USE_C10D_XCCL} PARENT_SCOPE)
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

if(BUILD_TEST)
Expand Down
11 changes: 4 additions & 7 deletions cmake/Modules/FindXCCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@ if (NOT EXISTS "${XCCL_ROOT}")
set(XCCL_ROOT $ENV{CCL_ROOT})
endif()

string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound)
if(nocclfound)
# Bail out when no OneCCL root could be resolved, i.e. XCCL_ROOT is empty or
# unset (which is what `set(XCCL_ROOT $ENV{CCL_ROOT})` yields when the
# CCL_ROOT environment variable is not set).
# Do not write `if(NOT DEFINED $ENV{CCL_ROOT})`: that expands the environment
# value first and then asks whether a CMake variable with that *name* exists,
# so the check is true whenever CCL_ROOT is set and malformed when it is not.
if("${XCCL_ROOT}" STREQUAL "")
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "OneCCL library not found!!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
set(XCCL_NOT_FOUND_MESSAGE "OneCCL library not found!!")
return()
endif()

Expand Down Expand Up @@ -56,15 +54,14 @@ find_library(

if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "OneCCL library not found!!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
set(XCCL_NOT_FOUND_MESSAGE "OneCCL library not found!!")
return()
endif()

find_package_handle_standard_args(
XCCL
FOUND_VAR XCCL_FOUND
REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}"
REASON_FAILURE_MESSAGE "${XCCL_NOT_FOUND_MESSAGE}"
)

2 changes: 2 additions & 0 deletions cmake/XCCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ if(NOT __XCCL_INCLUDED)
set_property(
TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES
${XCCL_LIBRARY})
set(USE_C10D_XCCL ON)
set(USE_C10D_XCCL ${USE_C10D_XCCL} PARENT_SCOPE)
endif()
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

5 changes: 4 additions & 1 deletion src/xccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})

set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)

# Copy the header file to the build directory so that the PyTorch registration file can locate it.
# Why copy the header file to the build directory?
# We want to register the XCCL backend with PyTorch c10d in torch/csrc/distributed/c10d/init.cpp#L27-L29.
# To align with the other backends, we need to copy the header file to the torch/csrc/distributed/c10d directory in the build tree.
# A future solution is to add an include search path so that torch/csrc/distributed/c10d/init.cpp#L27-L29 can find the header.
foreach(HEADER ${xccl_h})
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
file(COPY ${HEADER} DESTINATION "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d")
endforeach()
Loading