Changes to support Perlmutter environment (#1360)
* .

* remove deadcode

* add benchmarking mode, initializing weights randomly

* better logging when running out of memory

* update

---------

Co-authored-by: Gabriele Oliaro <[email protected]>
goliaro and Gabriele Oliaro authored Apr 8, 2024
1 parent 1210256 commit b4a639c
Showing 20 changed files with 159 additions and 150 deletions.
15 changes: 13 additions & 2 deletions cmake/cuda.cmake
@@ -13,8 +13,19 @@ if(CUDA_FOUND)
   # set cuda runtime and driver lib
   # override cublas and curand because the FindCUDA module may not find the correct libs
   set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT})
-  set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT})
+  if(CUBLAS_PATH)
+    set(CUBLAS_ROOT ${CUBLAS_PATH})
+  else()
+    set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+  endif()
+  set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT})
+  if(CURAND_PATH)
+    set(CURAND_ROOT ${CURAND_PATH})
+  else()
+    set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+  endif()
+  set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT})
 
   list(APPEND FLEXFLOW_EXT_LIBRARIES
     ${CUDADRV_LIBRARIES}
     ${CUDA_CUBLAS_LIBRARIES}
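With this change, cuBLAS and cuRAND no longer have to live inside the CUDA toolkit tree, which is the situation on systems like Perlmutter where the NVIDIA HPC SDK ships the math libraries separately. A minimal sketch of driving the new cache variables directly from CMake; the paths are illustrative placeholders, not Perlmutter's actual module layout:

```bash
# The directories passed as CUBLAS_PATH/CURAND_PATH must each contain
# lib64/libcublas.so or lib64/libcurand.so, since cuda.cmake appends
# lib64/libcublas${LIBEXT} (resp. libcurand) to the chosen root.
cmake \
  -DCUBLAS_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2 \
  -DCURAND_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2 \
  ..   # plus the usual FlexFlow flags assembled by config/config.inc
```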
12 changes: 11 additions & 1 deletion config/config.inc
@@ -62,6 +62,16 @@ if [ -n "$CUDA_DIR" ]; then
   SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}"
 fi
 
+# set cublas dir
+if [ -n "$CUBLAS_DIR" ]; then
+  SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}"
+fi
+
+# set curand dir
+if [ -n "$CURAND_DIR" ]; then
+  SET_CURAND="-DCURAND_PATH=${CURAND_DIR}"
+fi
+
 # set cudnn dir
 if [ -n "$CUDNN_DIR" ]; then
   SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}"
@@ -231,7 +241,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
   fi
 fi
 
-CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
+CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
 
 function run_cmake() {
 SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
14 changes: 10 additions & 4 deletions config/config.linux
@@ -36,12 +36,18 @@ FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"}
 # or all available architectures. TODO: support autodetect
 FF_HIP_ARCH=${FF_HIP_ARCH:-"all"}
 
-# set CUDNN dir in case cmake cannot autodetect a path
-CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
-
 # set CUDA dir in case cmake cannot autodetect a path
 CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}
 
+# set CUBLAS dir in case it is not stored in the CUDA DIR
+CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"}
+
+# set CURAND dir in case it is not stored in the CUDA DIR
+CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"}
+
+# set CUDNN dir in case cmake cannot autodetect a path
+CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
+
 # if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib,
 # otherwise, we will build nccl from source
 NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}
@@ -102,7 +108,7 @@ fi
 
 function get_build_configs() {
   # Create a string with the values of the variables set in this script
-  BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
+  BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
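The wrapper scripts plumb the same knobs through: config.linux defines CUBLAS_DIR and CURAND_DIR with CUDA-toolkit defaults, and config.inc turns any override into the -DCUBLAS_PATH/-DCURAND_PATH flags shown above. A hedged sketch of an invocation on an NVHPC-style system; the paths are placeholders for whatever your site's module system reports:

```bash
# Illustrative paths only; point each *_DIR at the directory whose lib64/
# contains the corresponding library.
export CUDA_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2
export CUBLAS_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2
export CURAND_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2
./config/config.linux   # config.inc emits -DCUBLAS_PATH/-DCURAND_PATH
```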
2 changes: 1 addition & 1 deletion include/flexflow/config.h
@@ -145,7 +145,7 @@ class FFConfig {
   Legion::Runtime *lg_hlr;
   Legion::IndexSpaceT<1> all_gpu_task_is;
   // Legion::FieldSpace field_space;
-  bool syntheticInput, profiling, perform_fusion;
+  bool benchmarking, profiling, perform_fusion;
   bool inference_debugging;
   size_t simulator_work_space_size;
   size_t search_budget;
4 changes: 3 additions & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -107,7 +107,9 @@ void parse_input_args(char **argv,
     }
   }
   if (paths.cache_folder_path.empty()) {
-    paths.cache_folder_path = "~/.cache/flexflow";
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
   }
   // Expand ~ to the home directory if needed
   wordexp_t p;
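The weight cache directory is now overridable through the environment, which matters on clusters where home-directory quotas are tight. A sketch assuming NERSC's $SCRATCH convention; any writable path works, and the ~/.cache/flexflow fallback still applies when the variable is unset:

```bash
# Redirect FlexFlow's model cache to scratch storage before launching.
export FF_CACHE_PATH="$SCRATCH/flexflow-cache"   # $SCRATCH is a NERSC convention
mkdir -p "$FF_CACHE_PATH"
```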
20 changes: 0 additions & 20 deletions inference/models/falcon.cc
@@ -252,26 +252,6 @@ void FALCON::create_falcon_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-
-#ifdef DEADCODE
-  // Compile the model
-  std::cout << "------start compile ----------" << std::endl;
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(&ff);
-  FileDataLoader fileloader("",
-                            weight_file_path,
-                            falcon_config.n_head,
-                            falcon_config.n_head_kv,
-                            falcon_config.hidden_size,
-                            falcon_config.hidden_size / falcon_config.n_head,
-                            ff.config.tensor_parallelism_degree);
-  std::cout << "------load weights ----------" << std::endl;
-  fileloader.load_weights(&ff, use_full_precision);
-  std::cout << "------load weight finished----------" << std::endl;
-
-  // init operators
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
10 changes: 0 additions & 10 deletions inference/models/llama.cc
@@ -277,16 +277,6 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-#ifdef DEADCODE
-  // Compile the model
-  std::cout << "------start compile ----------" << std::endl;
-  im->compile_model_and_allocate_buffer(&ff);
-  fileloader.load_weights(&ff);
-  std::cout << "------load weight finished----------" << std::endl;
-
-  // init operators
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
15 changes: 0 additions & 15 deletions inference/models/mpt.cc
@@ -259,21 +259,6 @@ void MPT::create_mpt_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-
-#ifdef DEADCODE
-  //------------------- compile the model --------------------------------
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(&ff);
-  FileDataLoader fileloader("",
-                            weight_file_path,
-                            mpt_config.n_heads,
-                            mpt_config.n_heads,
-                            mpt_config.hidden_size,
-                            mpt_config.hidden_size / mpt_config.n_heads,
-                            ff.config.tensor_parallelism_degree);
-  fileloader.load_weights(&ff, use_full_precision);
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
18 changes: 0 additions & 18 deletions inference/models/opt.cc
@@ -266,24 +266,6 @@ void OPT::create_opt_model(FFModel &ff,
                            use_full_precision);
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-
-#ifdef DEADCODE
-  //------------------- compile the model --------------------------------
-  std::cout << "------start compile ----------" << std::endl;
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(&ff);
-  FileDataLoader fileloader("",
-                            weight_file_path,
-                            opt_config.num_attention_heads,
-                            opt_config.num_attention_heads,
-                            opt_config.hidden_size,
-                            opt_config.hidden_size /
-                                opt_config.num_attention_heads,
-                            ff.config.tensor_parallelism_degree);
-  fileloader.load_weights(&ff, use_full_precision);
-  std::cout << "------finished loading weights----------" << std::endl;
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
10 changes: 0 additions & 10 deletions inference/models/starcoder.cc
@@ -232,16 +232,6 @@ void STARCODER::create_starcoder_model(
       ff.config.tensor_parallelism_degree,
       use_full_precision);
   im->register_model_weights_loader(&ff, fileloader);
-#ifdef DEADCODE
-  // Compile the model
-  std::cout << "------start compile ----------" << std::endl;
-  im->compile_model_and_allocate_buffer(&ff);
-  fileloader.load_weights(&ff, use_full_precision);
-  std::cout << "------load weight finished----------" << std::endl;
-
-  // init operators
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
3 changes: 2 additions & 1 deletion inference/python/incr_decoding.py
@@ -55,14 +55,15 @@ def get_configs():
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
         "profiling": False,
+        "benchmarking": False,
         "inference_debugging": False,
         "fusion": True,
     }
     llm_configs = {
         # required parameters
         "llm_model": "tiiuae/falcon-7b",
         # optional parameters
-        "cache_path": "",
+        "cache_path": os.environ.get("FF_CACHE_PATH", ""),
         "refresh_cache": False,
         "full_precision": False,
         "prompt": "",
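The Python demos read the same variable when assembling their default configs, so no script edit is needed to relocate the cache. A sketch with an illustrative path:

```bash
# cache_path defaults to os.environ.get("FF_CACHE_PATH", "") per the diff above.
FF_CACHE_PATH=/tmp/ff-cache python inference/python/incr_decoding.py
```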
3 changes: 2 additions & 1 deletion inference/python/spec_infer.py
@@ -55,14 +55,15 @@ def get_configs():
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
         "profiling": False,
+        "benchmarking": False,
         "inference_debugging": False,
         "fusion": True,
     }
     llm_configs = {
         # required llm arguments
         "llm_model": "meta-llama/Llama-2-7b-hf",
         # optional llm parameters
-        "cache_path": "",
+        "cache_path": os.environ.get("FF_CACHE_PATH", ""),
         "refresh_cache": False,
         "full_precision": False,
         "ssms": [
4 changes: 3 additions & 1 deletion inference/spec_infer/spec_infer.cc
@@ -124,7 +124,9 @@ void parse_input_args(char **argv,
     }
   }
   if (paths.cache_folder_path.empty()) {
-    paths.cache_folder_path = "~/.cache/flexflow";
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
   }
   // Expand ~ to the home directory if needed
   wordexp_t p;
4 changes: 2 additions & 2 deletions inference/utils/download_hf_model.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import flexflow.serve as ff
-import argparse
+import argparse, os
 
 
 def parse_args():
@@ -12,7 +12,7 @@ def parse_args():
         "--cache-folder",
         type=str,
         help="Folder to use to store the model(s) assets in FlexFlow format",
-        default="",
+        default=os.environ.get("FF_CACHE_PATH", ""),
     )
     parser.add_argument(
         "--refresh-cache",
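This makes it easy to pre-populate the cache from a login node before launching jobs on compute nodes without internet access. A sketch; the script's remaining model arguments are not shown in this diff and are left elided:

```bash
# Either route works: the new FF_CACHE_PATH default or the explicit flag
# (the flag takes effect since it fills the same --cache-folder option).
export FF_CACHE_PATH="$SCRATCH/flexflow-cache"
python inference/utils/download_hf_model.py --cache-folder "$FF_CACHE_PATH" ...
```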
1 change: 1 addition & 0 deletions python/flexflow/core/__init__.py
@@ -41,6 +41,7 @@
     "num_cpus": "-ll:cpu",
     "legion_utility_processors": "-ll:util",
     "profiling": "--profiling",
+    "benchmarking": "--benchmarking",
     "inference_debugging": "--inference-debugging",
     "fusion": "--fusion",
     "disable_control_replication": "--disable-control-replication",
8 changes: 8 additions & 0 deletions python/flexflow/serve/__init__.py
@@ -45,6 +45,7 @@ def init(
     use_4bit_quantization: Optional[bool] = None,
     use_8bit_quantization: Optional[bool] = None,
     profiling: Optional[bool] = None,
+    benchmarking: Optional[bool] = None,
     inference_debugging: Optional[bool] = None,
     fusion: Optional[bool] = None,
 ):
@@ -72,6 +73,7 @@
     - use_4bit_quantization: whether to use 4-bit quantization, defaults to False
     - use_8bit_quantization: whether to use 8-bit quantization, defaults to False
     - profiling: whether to enable the FlexFlow profiling mode, defaults to False
+    - benchmarking: whether to run benchmarking only, without loading real weights, defaults to False
     - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
     - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
@@ -106,6 +108,8 @@
     :type use_8bit_quantization: Optional[bool], optional
     :param profiling: whether to enable the FlexFlow profiling mode, defaults to False
     :type profiling: Optional[bool], optional
+    :param benchmarking: whether to run benchmarking only, without loading real weights, defaults to False
+    :type benchmarking: Optional[bool], optional
     :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
     :type inference_debugging: Optional[bool], optional
     :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
@@ -132,6 +136,7 @@
         use_4bit_quantization is not None,
         use_8bit_quantization is not None,
         profiling is not None,
+        benchmarking is not None,
         inference_debugging is not None,
         fusion is not None,
     ]
@@ -157,6 +162,7 @@
         "use_4bit_quantization": use_4bit_quantization,
         "use_8bit_quantization": use_8bit_quantization,
         "profiling": profiling,
+        "benchmarking": benchmarking,
         "inference_debugging": inference_debugging,
         "fusion": fusion,
     }
@@ -201,6 +207,8 @@
     configs_dict["use_8bit_quantization"] = False
     if configs_dict.get("profiling", None) is None:
         configs_dict["profiling"] = False
+    if configs_dict.get("benchmarking", None) is None:
+        configs_dict["benchmarking"] = False
     if configs_dict.get("inference_debugging", None) is None:
         configs_dict["inference_debugging"] = False
     if configs_dict.get("fusion", None) is None:
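Benchmarking mode initializes weights randomly instead of loading checkpoints, so kernel and scheduling performance can be measured without staging multi-gigabyte model files. A minimal sketch; benchmarking itself is from this diff, but the resource arguments (num_gpus, memory_per_gpu, zero_copy_memory_per_node) are assumed from the wider ff.init signature and their values are purely illustrative:

```bash
python - <<'EOF'
import flexflow.serve as ff

# benchmarking=True maps to the --benchmarking flag registered in
# python/flexflow/core/__init__.py and skips real weight loading.
ff.init(
    num_gpus=4,                      # assumed resource parameters, not from this diff
    memory_per_gpu=14000,
    zero_copy_memory_per_node=30000,
    benchmarking=True,
)
EOF
```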
(Diffs for the remaining changed files were not loaded.)