Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add distributed backend (XCCL) #1105

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules)
# Load the SYCL and build-flag CMake scripts located under
# ${TORCH_XPU_OPS_ROOT}/cmake.
include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake)
include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake)

# User-facing switch for building with XCCL support; enabled by default.
option(USE_XCCL "Build with XCCL support" ON)
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

# XCCL.cmake is only pulled in when USE_XCCL is enabled and the platform is
# not Windows.
if(USE_XCCL AND NOT WIN32)
  include("${TORCH_XPU_OPS_ROOT}/cmake/XCCL.cmake")
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

# Tests are opt-in: when BUILD_TEST is set, the SYCL test directory is
# configured into the test_sycl directory of the top-level build tree.
if(BUILD_TEST)
  add_subdirectory(
    "${TORCH_XPU_OPS_ROOT}/test/sycl"
    "${CMAKE_BINARY_DIR}/test_sycl")
endif()
Expand Down
62 changes: 62 additions & 0 deletions cmake/Modules/FindXCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Find module for XCCL (oneCCL). The search root is taken from the CCL_ROOT
# environment variable.
#
# This will define the following variables:
#   XCCL_FOUND       : True if the system has the XCCL library.
#   XCCL_INCLUDE_DIR : Include directories needed to use XCCL.
#   XCCL_LIBRARY_DIR : The path to the XCCL library.
#   XCCL_LIBRARY     : XCCL library fullname.

# Load find_package_handle_standard_args() from the Modules directory of the
# running CMake installation.
include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)

# The oneCCL environment must be sourced before configuring: XCCL_ROOT is read
# from the CCL_ROOT environment variable at configure time.
# NOTE(review): CCL_ROOT is not validated here; if it is not exported, this
# expands to set(XCCL_ROOT) with no value - confirm that the oneCCL
# environment script always exports it.
set(XCCL_ROOT $ENV{CCL_ROOT})

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you get CCL_ROOT? I don't think you can assume it will be set after sourcing the oneCCL environment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is set automatically after sourcing the oneCCL environment, and as far as I remember, oneCCL updates do not affect this flag.


# Locate the XCCL installation layout below XCCL_ROOT. Every search uses
# NO_DEFAULT_PATH, so only the hinted locations are consulted.

# The "include" entry directly below XCCL_ROOT.
find_file(
  XCCL_INCLUDE_DIR
  NAMES include
  HINTS ${XCCL_ROOT}
  NO_DEFAULT_PATH
)

# The "oneapi" entry below XCCL_ROOT/include.
find_file(
  XCCL_INCLUDE_ONEAPI_DIR
  NAMES oneapi
  HINTS ${XCCL_ROOT}/include/
  NO_DEFAULT_PATH
)

# The "lib" entry directly below XCCL_ROOT.
find_file(
  XCCL_LIBRARY_DIR
  NAMES lib
  HINTS ${XCCL_ROOT}
  NO_DEFAULT_PATH
)

# Full path of the "ccl" library inside the library directory.
find_library(
  XCCL_LIBRARY
  NAMES ccl
  HINTS ${XCCL_LIBRARY_DIR}
  NO_DEFAULT_PATH
)

# All four results are required. XCCL_INCLUDE_ONEAPI_DIR is checked explicitly
# and *before* it is appended to XCCL_INCLUDE_DIR, so a "-NOTFOUND" value never
# ends up inside the include list.
# On failure the module returns early without calling
# find_package_handle_standard_args(), leaving XCCL_FOUND false and
# XCCL_NOT_FOUND_MESSAGE set for the caller to inspect.
if(NOT XCCL_INCLUDE_DIR
   OR NOT XCCL_INCLUDE_ONEAPI_DIR
   OR NOT XCCL_LIBRARY_DIR
   OR NOT XCCL_LIBRARY)
  set(XCCL_FOUND False)
  set(XCCL_NOT_FOUND_MESSAGE "OneCCL library not found!!")
  return()
endif()

# Expose both <root>/include and <root>/include/oneapi as include directories.
# REMOVE_DUPLICATES keeps the list stable if this module is processed again
# while the extended list is still visible in the current scope.
list(APPEND XCCL_INCLUDE_DIR "${XCCL_INCLUDE_ONEAPI_DIR}")
list(REMOVE_DUPLICATES XCCL_INCLUDE_DIR)

# Extend the search paths used by subsequent find_* calls.
list(APPEND CMAKE_INCLUDE_PATH "${XCCL_INCLUDE_DIR}")
list(APPEND CMAKE_LIBRARY_PATH "${XCCL_LIBRARY_DIR}")

find_package_handle_standard_args(
  XCCL
  FOUND_VAR XCCL_FOUND
  REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
  REASON_FAILURE_MESSAGE "${XCCL_NOT_FOUND_MESSAGE}"
)

# Keep the result cache entries out of the default cache view.
mark_as_advanced(
  XCCL_INCLUDE_DIR
  XCCL_INCLUDE_ONEAPI_DIR
  XCCL_LIBRARY_DIR
  XCCL_LIBRARY
)
21 changes: 21 additions & 0 deletions cmake/XCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Sets up the XCCL dependency. Guarded by __XCCL_INCLUDED, so including this
# file again while that variable is visible is a no-op.
#
# When XCCL is found this file:
#   - defines the imported interface target torch::xccl, carrying
#     XCCL_INCLUDE_DIR as include directories and XCCL_LIBRARY as link library;
#   - sets USE_C10D_XCCL to ON in the current scope and, when one exists, in
#     the parent scope.
# When XCCL is not found, XCCL_NOT_FOUND_MESSAGE is printed and the file
# returns without defining the target or USE_C10D_XCCL.
if(NOT __XCCL_INCLUDED)
  set(__XCCL_INCLUDED TRUE)

  # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
  find_package(XCCL REQUIRED)
  if(NOT XCCL_FOUND)
    message("${XCCL_NOT_FOUND_MESSAGE}")
    return()
  endif()

  # XCCL_FOUND is known to be true from here on because of the early return
  # above, so no further check is needed.
  add_library(torch::xccl INTERFACE IMPORTED)
  set_property(
    TARGET torch::xccl
    PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${XCCL_INCLUDE_DIR})
  set_property(
    TARGET torch::xccl
    PROPERTY INTERFACE_LINK_LIBRARIES ${XCCL_LIBRARY})

  set(USE_C10D_XCCL ON)

  # set(... PARENT_SCOPE) warns when the current scope has no parent, which
  # happens when this file is included from the top-level directory outside
  # of any function. Only propagate when a parent directory or an enclosing
  # function scope exists.
  get_directory_property(__XCCL_PARENT_DIR PARENT_DIRECTORY)
  if(__XCCL_PARENT_DIR OR NOT "${CMAKE_CURRENT_FUNCTION}" STREQUAL "")
    set(USE_C10D_XCCL ON PARENT_SCOPE)
  endif()
  unset(__XCCL_PARENT_DIR)
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 7 additions & 1 deletion src/BuildOnLinux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ add_library(
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

# When USE_C10D_XCCL is set, torch_xpu_ops links the torch::xccl target
# (PUBLIC) and is compiled with the USE_C10D_XCCL definition (PRIVATE).
if(USE_C10D_XCCL)
  target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
  target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
endif()

if(BUILD_SEPARATE_OPS)
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
Expand Down
5 changes: 4 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake)
# Clear every source-list variable before the subdirectories are processed.
foreach(xpu_src_list IN ITEMS
    ATen_XPU_CPP_SRCS
    ATen_XPU_NATIVE_CPP_SRCS
    ATen_XPU_SYCL_SRCS
    ATen_XPU_XCCL_SRCS)
  unset(${xpu_src_list})
endforeach()

# Cached include root for the ATen XPU sources.
set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory")

add_subdirectory(ATen)

# The xccl subdirectory is only configured when USE_C10D_XCCL is set.
if(USE_C10D_XCCL)
  add_subdirectory(xccl)
endif()
# As the binary size grows, we have to split libtorch_xpu.so into
# multiple libraries. Because of strict linkage requirements on Windows,
# we add extra logic to 1) resolve cyclic dependencies and 2) make symbols visible.
Expand Down
16 changes: 16 additions & 0 deletions src/xccl/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# XCCL sources
#
# Collects the headers (*.hpp) and sources (*.cpp) of this directory, appends
# the sources to ATen_XPU_XCCL_SRCS and exports that list to the parent scope.
# CONFIGURE_DEPENDS makes the build system re-check the globs, so adding or
# removing a file triggers a re-configure instead of being silently missed.
file(GLOB xccl_h CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp")
file(GLOB xccl_cpp CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")

list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})

set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)

# Why copy the header files to the build directory?
# We want to register the XCCL backend to PyTorch c10d in torch/csrc/distributed/c10d/init.cpp#L27-L29.
# To align with other backends, the header files are copied to the torch/csrc/distributed/c10d directory of the build tree.
# A future solution is to add an include search path for torch/csrc/distributed/c10d/init.cpp#L27-L29 instead.
#
# configure_file(... COPYONLY) is used instead of file(COPY) so that every
# header becomes a configure-time dependency: editing a header re-runs CMake
# and refreshes the copy, instead of leaving a stale file in the build tree.
set(xccl_header_dest "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d")
foreach(HEADER ${xccl_h})
  get_filename_component(xccl_header_name "${HEADER}" NAME)
  configure_file("${HEADER}" "${xccl_header_dest}/${xccl_header_name}" COPYONLY)
endforeach()
Loading
Loading