From e796bc9b92c63a8b1fb9c315b27f2d8e4dec1f6a Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 25 Jul 2023 23:27:29 -0700 Subject: [PATCH 1/6] fix for ucx --- CMakeLists.txt | 10 +--------- cmake/legion.cmake | 2 ++ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7504d7026..42f448f854 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,8 +137,7 @@ if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") endif() if (FF_LEGION_NETWORKS STREQUAL "ucx") - set(ucx_DIR ${UCX_DIR}/cmake) - set(ENV{Legion_NETWORKS} "ucx") + set(ucx_ROOT ${UCX_DIR}/cmake) message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}") endif() else() @@ -179,13 +178,6 @@ set(CC_FLAGS $ENV{CC_FLAGS}) set(NVCC_FLAGS $ENV{NVCC_FLAGS}) set(LD_FLAGS $ENV{LD_FLAGS}) -# Set global FLAGS -list(APPEND CC_FLAGS - -std=c++11) - -list(APPEND NVCC_FLAGS - -std=c++11) - add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) link_libraries(${LD_FLAGS}) diff --git a/cmake/legion.cmake b/cmake/legion.cmake index b4cfad20e2..a5e6a24f66 100644 --- a/cmake/legion.cmake +++ b/cmake/legion.cmake @@ -132,6 +132,8 @@ else() set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version") set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit") set(GASNet_CONDUIT ${FF_GASNET_CONDUIT}) + elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx") + set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX") endif() message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}") set(Legion_MAX_DIM ${FF_MAX_DIM} CACHE STRING "Maximum number of dimensions") From 4def3b30295d19aae9ebcc402f6f796681197c50 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 8 Aug 2023 22:44:06 -0700 Subject: [PATCH 2/6] fixup --- CMakeLists.txt | 100 +++----------------------------------------- cmake/legion.cmake | 2 + cmake/nccl.cmake | 3 +- cmake/ucx.cmake | 80 +++++++++++++++++++++++++++++++++++ config/config.inc | 7 +++- config/config.linux | 14 
+++---- 6 files changed, 102 insertions(+), 104 deletions(-) create mode 100644 cmake/ucx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 42f448f854..2043336806 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,105 +43,15 @@ option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if availab option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON) option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF) -# option for using Python -set(FF_GASNET_CONDUITS aries udp mpi ibv ucx) +# option for using network +set(FF_GASNET_CONDUITS aries udp mpi ibv) set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}") set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS}) set(FF_LEGION_NETWORKS "" CACHE STRING "Network backend(s) to use") -if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") OR FF_LEGION_NETWORKS STREQUAL "ucx") - if("${FF_UCX_URL}" STREQUAL "") - set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz") - else() - set(UCX_URL "${FF_UCX_URL}") - endif() - - set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx) - get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME) - # message(STATUS "UCX_URL: ${UCX_URL}") - # message(STATUS "UCX_COMPRESSED_FILE_NAME: ${UCX_COMPRESSED_FILE_NAME}") - set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}") - set(UCX_BUILD_NEEDED OFF) - set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt) - set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log) - - if(EXISTS ${UCX_CONFIG_FILE}) - file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG) - # message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}") - if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}") - # configs match - no build needed - set(UCX_BUILD_NEEDED OFF) - else() - message(STATUS "UCX configuration has changed - rebuilding...") - set(UCX_BUILD_NEEDED ON) - endif() - 
else() - message(STATUS "Configuring and building UCX...") - set(UCX_BUILD_NEEDED ON) - endif() - - if(UCX_BUILD_NEEDED) - if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}") - message(STATUS "Downloading openucx/ucx from: ${UCX_URL}") - file( - DOWNLOAD - "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}" - SHOW_PROGRESS - STATUS status - LOG log - ) - - list(GET status 0 status_code) - list(GET status 1 status_string) - - if(status_code EQUAL 0) - message(STATUS "Downloading... done") - else() - message(FATAL_ERROR "error: downloading '${UCX_URL}' failed - status_code: ${status_code} - status_string: ${status_string} - log: - --- LOG BEGIN --- - ${log} - --- LOG END ---" - ) - endif() - else() - message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists") - endif() - - execute_process(COMMAND mkdir -p ${UCX_DIR}) - execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1) - message(STATUS "Building UCX...") - execute_process( - COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install" - RESULT_VARIABLE UCX_BUILD_STATUS - OUTPUT_FILE ${UCX_BUILD_OUTPUT} - ERROR_FILE ${UCX_BUILD_OUTPUT} - ) - - if(UCX_BUILD_STATUS) - message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details") - endif() - - # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings - file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}") - endif() - - if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") - set(ENV{UCX_HOME} "${UCX_DIR}/install") - install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin) - install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include) - install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib) - install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share) - endif() - - if (FF_LEGION_NETWORKS STREQUAL "ucx") - set(ucx_ROOT ${UCX_DIR}/cmake) - 
message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}") - endif() -else() - message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}") +message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}") +if (FF_LEGION_NETWORKS STREQUAL "gasnet") + message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}") endif() set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel) diff --git a/cmake/legion.cmake b/cmake/legion.cmake index a5e6a24f66..1c37d59744 100644 --- a/cmake/legion.cmake +++ b/cmake/legion.cmake @@ -133,6 +133,8 @@ else() set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit") set(GASNet_CONDUIT ${FF_GASNET_CONDUIT}) elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx") + set(ucx_ROOT ${UCX_PATH}/lib/cmake) + message(STATUS "Find ucx: ${UCX_PATH}") set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX") endif() message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}") diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 04a23dcb8a..c140a44ec8 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -109,8 +109,9 @@ else() message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) add_library(nccl SHARED IMPORTED) + + # Build NCCL from source else() - # Build NCCL from source message(STATUS "Building NCCL from source") list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) diff --git a/cmake/ucx.cmake b/cmake/ucx.cmake new file mode 100644 index 0000000000..a530741e4c --- /dev/null +++ b/cmake/ucx.cmake @@ -0,0 +1,80 @@ +if(UCX_PATH) + set(ucx_ROOT ${UCX_PATH}/cmake) + message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}") + message(STATUS "Find ucx: ${UCX_PATH}") +else() + set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz") + + set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx) + get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME) + # message(STATUS "UCX_URL: ${UCX_URL}") + # message(STATUS "UCX_COMPRESSED_FILE_NAME: 
${UCX_COMPRESSED_FILE_NAME}") + set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}") + set(UCX_BUILD_NEEDED OFF) + set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt) + set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log) + + if(EXISTS ${UCX_CONFIG_FILE}) + file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG) + # message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}") + if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}") + # configs match - no build needed + set(UCX_BUILD_NEEDED OFF) + else() + message(STATUS "UCX configuration has changed - rebuilding...") + set(UCX_BUILD_NEEDED ON) + endif() + else() + message(STATUS "Configuring and building UCX...") + set(UCX_BUILD_NEEDED ON) + endif() + + if(UCX_BUILD_NEEDED) + if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}") + message(STATUS "Downloading openucx/ucx from: ${UCX_URL}") + file( + DOWNLOAD + "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}" + SHOW_PROGRESS + STATUS status + LOG log + ) + + list(GET status 0 status_code) + list(GET status 1 status_string) + + if(status_code EQUAL 0) + message(STATUS "Downloading... 
done") + else() + message(FATAL_ERROR "error: downloading '${UCX_URL}' failed + status_code: ${status_code} + status_string: ${status_string} + log: + --- LOG BEGIN --- + ${log} + --- LOG END ---" + ) + endif() + else() + message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists") + endif() + + execute_process(COMMAND mkdir -p ${UCX_DIR}) + execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1) + message(STATUS "Building UCX...") + execute_process( + COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install" + RESULT_VARIABLE UCX_BUILD_STATUS + OUTPUT_FILE ${UCX_BUILD_OUTPUT} + ERROR_FILE ${UCX_BUILD_OUTPUT} + ) + + if(UCX_BUILD_STATUS) + message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details") + endif() + + # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings + file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}") + endif() + +endif() \ No newline at end of file diff --git a/config/config.inc b/config/config.inc index ebc6b9cb49..9f65d32ce4 100644 --- a/config/config.inc +++ b/config/config.inc @@ -49,6 +49,11 @@ if [ -n "$CUDNN_DIR" ]; then SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}" fi +# set ucx dir +if [ -n "$UCX_DIR" ]; then + SET_UCX="-DUCX_PATH=${UCX_DIR}" +fi + # enable Python if [ "$FF_USE_PYTHON" = "ON" ]; then SET_PYTHON="-DFF_USE_PYTHON=ON" @@ -188,7 +193,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" 
+CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_UCX} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 90cf722453..b958aba0b3 100755 --- a/config/config.linux +++ b/config/config.linux @@ -40,8 +40,8 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} -# set UCX URL -FF_UCX_URL=${FF_UCX_URL:-""} +# set UCX dir if Legion networks is set to ucx +UCX_DIR=${UCX_DIR:-""} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} @@ -52,6 +52,11 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} # use precompiled NCCL and Legion libraries, where available FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF} + +# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, +# otherwise, we will build nccl from source +NCCL_DIR=${NCCL_DIR:-""} + # use the flag below to use both the NCCL and Legion pre-built libraries. # when the flag below is set to ON, the two flags above are ignored. 
FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF} @@ -77,11 +82,6 @@ else FF_USE_NCCL=OFF fi -function get_build_configs() { - # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" -} - if [ -n "$1" ]; then if [ "$1" != "get-docker-configs" ]; then . $(dirname $0)/config.inc From fdeb31fb5280f7662d1780930a398a4703645ce7 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 8 Aug 2023 22:56:19 -0700 Subject: [PATCH 3/6] fixup --- cmake/ucx.cmake | 80 ------------------------------------------------- 1 file changed, 80 deletions(-) delete mode 100644 cmake/ucx.cmake diff --git a/cmake/ucx.cmake b/cmake/ucx.cmake deleted file mode 100644 index a530741e4c..0000000000 --- a/cmake/ucx.cmake +++ /dev/null @@ -1,80 +0,0 @@ -if(UCX_PATH) - set(ucx_ROOT ${UCX_PATH}/cmake) - message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}") - message(STATUS "Find ucx: ${UCX_PATH}") -else() - set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz") - - set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx) - get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME) - # message(STATUS "UCX_URL: ${UCX_URL}") - # message(STATUS "UCX_COMPRESSED_FILE_NAME: ${UCX_COMPRESSED_FILE_NAME}") - set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}") - set(UCX_BUILD_NEEDED OFF) - set(UCX_CONFIG_FILE 
${UCX_DIR}/config.txt) - set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log) - - if(EXISTS ${UCX_CONFIG_FILE}) - file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG) - # message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}") - if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}") - # configs match - no build needed - set(UCX_BUILD_NEEDED OFF) - else() - message(STATUS "UCX configuration has changed - rebuilding...") - set(UCX_BUILD_NEEDED ON) - endif() - else() - message(STATUS "Configuring and building UCX...") - set(UCX_BUILD_NEEDED ON) - endif() - - if(UCX_BUILD_NEEDED) - if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}") - message(STATUS "Downloading openucx/ucx from: ${UCX_URL}") - file( - DOWNLOAD - "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}" - SHOW_PROGRESS - STATUS status - LOG log - ) - - list(GET status 0 status_code) - list(GET status 1 status_string) - - if(status_code EQUAL 0) - message(STATUS "Downloading... done") - else() - message(FATAL_ERROR "error: downloading '${UCX_URL}' failed - status_code: ${status_code} - status_string: ${status_string} - log: - --- LOG BEGIN --- - ${log} - --- LOG END ---" - ) - endif() - else() - message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists") - endif() - - execute_process(COMMAND mkdir -p ${UCX_DIR}) - execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1) - message(STATUS "Building UCX...") - execute_process( - COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install" - RESULT_VARIABLE UCX_BUILD_STATUS - OUTPUT_FILE ${UCX_BUILD_OUTPUT} - ERROR_FILE ${UCX_BUILD_OUTPUT} - ) - - if(UCX_BUILD_STATUS) - message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details") - endif() - - # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings - file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}") - endif() - -endif() \ No newline at end of file 
From d7c8dc95f3d965b1856129aa1f8b78ad77c273bf Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Wed, 9 Aug 2023 06:52:56 -0700 Subject: [PATCH 4/6] fixup --- CMakeLists.txt | 3 +++ config/config.linux | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2043336806..6f24b7ddd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,9 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +# set std 11 +set (CMAKE_CXX_STANDARD 11) + # do not disable assertions even if in release mode set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") diff --git a/config/config.linux b/config/config.linux index 54fd553621..cfc73b9523 100755 --- a/config/config.linux +++ b/config/config.linux @@ -82,6 +82,11 @@ else FF_USE_NCCL=OFF fi +function get_build_configs() { + # Collect the build-related variables set in this script into BUILD_CONFIGS as space-separated KEY=VALUE pairs (UCX_DIR replaces the removed FF_UCX_URL) + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" +} + if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then . 
$(dirname $0)/config.inc # Passing CMAKE_FLAGS or CUDA_PATH as $1 will print the value of the CMAKE_FLAGS/CUDA_PATH variable, From d17f6263df6b5b14ceda532fad7ccd2150a96a26 Mon Sep 17 00:00:00 2001 From: vincent163 Date: Fri, 3 Nov 2023 23:45:12 +0800 Subject: [PATCH 5/6] Add documentation for multi node installation using UCX (#1221) Co-authored-by: vincent-163 --- MULTI-NODE.md | 55 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/MULTI-NODE.md b/MULTI-NODE.md index a8fd2fb705..00dfdbffc3 100644 --- a/MULTI-NODE.md +++ b/MULTI-NODE.md @@ -17,15 +17,33 @@ Source: Custom (use the security group ID) You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network. -## 2. Configure and build FlexFlow +## 2. Configure and build UCX -Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. +Find the latest source code release for UCX at https://github.com/openucx/ucx/releases. As of writing this documentation, the latest UCX was 1.15.0 at https://github.com/openucx/ucx/releases/download/v1.15.0/ucx-1.15.0.tar.gz. Extract it and switch to the directory with UCX source code, and run: + +``` +CUDA_PATH=/usr/local/cuda +PREFIX=$PWD/install +./contrib/configure-release-mt --prefix="$PREFIX" --without-go --enable-mt --with-cuda="$CUDA_PATH" +make -j install +echo "$PREFIX" +``` + +Set `CUDA_PATH` to the root of your CUDA installation. If you don't know it, run `which nvcc`; the CUDA root is the directory that contains `bin/nvcc` (for example, `/usr/local/cuda` for `/usr/local/cuda/bin/nvcc`). Take note of the UCX installation path, which is echoed by the last command. + +## 3. 
Configure and build FlexFlow + +Follow steps 1 to 5 in [INSTALL.md](INSTALL.md#1-download-the-source-code) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. Or you can use NFS to mount home directory of each instance so that only a single build is necessary. You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance. -For step 4 (Configuring the FlexFlow build), make sure to specify a network using the `FF_LEGION_NETWORKS` parameter. We recommend using `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT=ucx`. Other configurations are optional. +For step 4 (Configuring the FlexFlow build), here are the parameters that need to be configured: +* Set `FF_LEGION_NETWORKS=ucx` +* Set `UCX_DIR` to the UCX installation path mentioned in [Configure and build UCX](#2-configure-and-build-ucx) + +Other configuration options are optional. -## 3. Configure MPI +## 4. Configure MPI MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them. @@ -64,8 +82,31 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su 5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostname of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)). -## 4. Test FlexFlow +## 5. Test FlexFlow -Follow step 6 in [INSTALL.md](INSTALL.md) to set environment variables. 
+Follow step 6 in [INSTALL.md](INSTALL.md#6-test-flexflow) to set environment variables. + +Save the following script as `mnist_mlp_run.sh` and make sure to change `FLEXFLOW_DIR` and `UCX_DIR` to the appropriate paths, and to replace `ens5` with the name of your network interface: + +```bash +#!/bin/bash +eval "$(conda shell.bash hook)" +conda activate flexflow +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + +# Path to your FlexFlow build +FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build + +# Path to your UCX installation +UCX_DIR=/home/ubuntu/ucx-1.15.0/install + +export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so +export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH + +mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 +``` -A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). \ No newline at end of file +Run the script to test FlexFlow by training the MNIST MLP example. You can adapt the script to run any other program. 
\ No newline at end of file From 258ae7dc60a445490c598776547463c1710e45ad Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Wed, 15 Nov 2023 16:35:39 -0800 Subject: [PATCH 6/6] minor fix --- config/config.linux | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/config/config.linux b/config/config.linux index cfc73b9523..73badb190e 100755 --- a/config/config.linux +++ b/config/config.linux @@ -28,7 +28,8 @@ CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} # set CUDA dir in case cmake cannot autodetect a path CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} -#set NCCL dir +# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, +# otherwise, we will build nccl from source NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"} # enable Python @@ -53,10 +54,6 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF} -# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, -# otherwise, we will build nccl from source -NCCL_DIR=${NCCL_DIR:-""} - # use the flag below to use both the NCCL and Legion pre-built libraries. # when the flag below is set to ON, the two flags above are ignored. FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF}