Skip to content

Commit

Permalink
update nccl
Browse files Browse the repository at this point in the history
  • Loading branch information
goliaro committed Sep 21, 2024
1 parent a0f1ed7 commit 5dbd18b
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 138 deletions.
198 changes: 71 additions & 127 deletions cmake/nccl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,140 +2,84 @@ set(NCCL_NAME nccl)
# set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}")
# message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}")

set(NCCL_URL "")
if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
if(LINUX_VERSION MATCHES "20.04")
if (CUDA_VERSION VERSION_EQUAL "11.0")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.1")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.2")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.3")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.4")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.5")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.6")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.7")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz")
endif()
elseif(LINUX_VERSION MATCHES "18.04")
if (CUDA_VERSION VERSION_EQUAL "10.1")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz")
elseif (CUDA_VERSION VERSION_EQUAL "10.2")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz")
elseif (CUDA_VERSION VERSION_EQUAL "11.0")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.1")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.2")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.3")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.4")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.5")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.6")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz")
elseif(CUDA_VERSION VERSION_EQUAL "11.7")
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz")
endif()
endif()
if(NCCL_PATH)
set(NCCL_ROOT ${NCCL_PATH})
else()
# if NCCL_PATH is not set, let's try to find it in the CUDA root
set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
endif()

if(NCCL_URL)
# Download and import pre-compiled NCCL library
message(STATUS "Using pre-compiled NCCL library")
message(STATUS "NCCL_URL: ${NCCL_URL}")
find_library(NCCL_LIBRARY
NAMES libnccl${LIBEXT}
PATHS ${NCCL_ROOT} ${CUDA_ROOT}
PATH_SUFFIXES lib lib64
DOC "NCCL library." )

include(FetchContent)
FetchContent_Declare(${NCCL_NAME}
URL ${NCCL_URL}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
FetchContent_GetProperties(${NCCL_NAME})
if(NOT ${NCCL_NAME}_POPULATED)
FetchContent_Populate(${NCCL_NAME})
endif()

set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME})
set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include)
set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib)
message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}")
add_library(nccl SHARED IMPORTED)
set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH})
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS ${NCCL_ROOT}
PATH_SUFFIXES include
DOC "NCCL include directory.")

list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT})
install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include)
install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)

else()
if(NCCL_PATH)
set(NCCL_ROOT ${NCCL_PATH})
# find NCCL, set NCCL lib and include
if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
set(NCCL_FOUND ON)
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})

# Check NCCL version
if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h")
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES
REGEX "#define NCCL_MAJOR [0-9]+" )
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2
REGEX "#define NCCL_MINOR [0-9]+" )
string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES})
string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2})
set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}")
if(NCCL_VERSION VERSION_LESS 2.23)
set(NCCL_OLD TRUE)
else()
set(NCCL_OLD FALSE)
endif()
message(STATUS "Found NCCL version: ${NCCL_VERSION}")
else()
# if NCCL_PATH is not set, let's try to find it in the CUDA root
set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
message(WARNING "NCCL header not found, unable to determine version")
set(NCCL_OLD TRUE) # Assume old version if we can't determine
endif()

find_library(NCCL_LIBRARY
NAMES libnccl${LIBEXT}
PATHS ${NCCL_ROOT} ${CUDA_ROOT}
PATH_SUFFIXES lib lib64
DOC "NCCL library." )
endif()

find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS ${NCCL_ROOT}
PATH_SUFFIXES include
DOC "NCCL include directory.")

# find NCCL, set NCCL lib and include
if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
set(NCCL_FOUND ON)
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
endif()

# find NCCL
if(NCCL_FOUND)
list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES})
list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS})
message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
add_library(nccl SHARED IMPORTED)

# Build NCCL from source
else()
message(STATUS "Building NCCL from source")
list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)

ExternalProject_Add(${NCCL_NAME}
SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME}
PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT}
INSTALL_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}"
BUILD_IN_SOURCE 1
)
# find NCCL
if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0))
list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES})
list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS})
message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
add_library(nccl SHARED IMPORTED)

ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR)
message(STATUS "NCCL install dir: ${INSTALL_DIR}")
list(APPEND FLEXFLOW_INCLUDE_DIRS
${INSTALL_DIR}/include)
list(APPEND FLEXFLOW_EXT_LIBRARIES
${INSTALL_DIR}/lib/libnccl${LIBEXT})
set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/")

install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include)
install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
endif()
# Build NCCL from source
else()
message(STATUS "Building NCCL from source")
list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)

ExternalProject_Add(${NCCL_NAME}
SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME}
PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT}
INSTALL_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}"
BUILD_IN_SOURCE 1
)

ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR)
message(STATUS "NCCL install dir: ${INSTALL_DIR}")
list(APPEND FLEXFLOW_INCLUDE_DIRS
${INSTALL_DIR}/include)
list(APPEND FLEXFLOW_EXT_LIBRARIES
${INSTALL_DIR}/lib/libnccl${LIBEXT})
set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/")

install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include)
install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
endif()
2 changes: 1 addition & 1 deletion deps/nccl
Submodule nccl updated 188 files
2 changes: 1 addition & 1 deletion docker/flexflow-environment/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow
LABEL org.opencontainers.image.description="FlexFlow environment container"

# Install basic dependencies
RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \
RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq nvtop && \
rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \
Expand Down
12 changes: 6 additions & 6 deletions docker/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ ATTACH_GPUS=${ATTACH_GPUS:-true}
gpu_arg=""
if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi

# Whether to attach inference weights / files (make sure to download the weights first)
ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false}

# Amount of shared memory to give the Docker container access to
# If you get a Bus Error, increase this value. If you don't have enough memory
Expand Down Expand Up @@ -115,9 +113,11 @@ if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":lat
exit 1
fi

inference_volumes=""
if $ATTACH_INFERENCE_FILES ; then
inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference";
hf_token_volume=""
hf_token_path="$HOME/.cache/huggingface/token"
if [ -f "$hf_token_path" ]; then
# If the token exists, add the volume mount to the Docker command
hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token"
fi

eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
3 changes: 0 additions & 3 deletions tests/inference_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ fi
# Clean up before test (just in case)
cleanup

# Make sure supported version of protobuf is installed
pip3 install protobuf==3.20.3

# Create test prompt file
mkdir -p ../inference/prompt
echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json
Expand Down

0 comments on commit 5dbd18b

Please sign in to comment.