Changes to support Perlmutter environment (#1360)
* .

* remove deadcode

* add benchmarking mode, initializing weights randomly

* better logging when running out of memory

* update

---------

Co-authored-by: Gabriele Oliaro <[email protected]>
goliaro and Gabriele Oliaro authored Apr 8, 2024
1 parent 1210256 commit b4a639c
Showing 20 changed files with 159 additions and 150 deletions.
15 changes: 13 additions & 2 deletions cmake/cuda.cmake
@@ -13,8 +13,19 @@ if(CUDA_FOUND)
   # set cuda runtime and driver lib
   # override cublas and curand because the FindCUDA module may not find the correct libs
   set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT})
-  set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT})
+  if(CUBLAS_PATH)
+    set(CUBLAS_ROOT ${CUBLAS_PATH})
+  else()
+    set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+  endif()
+  set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT})
+  if(CURAND_PATH)
+    set(CURAND_ROOT ${CURAND_PATH})
+  else()
+    set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+  endif()
+  set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT})
 
   list(APPEND FLEXFLOW_EXT_LIBRARIES
     ${CUDADRV_LIBRARIES}
     ${CUDA_CUBLAS_LIBRARIES}
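With this change, cuBLAS and cuRAND no longer have to live inside the CUDA toolkit tree, which is the situation on systems like Perlmutter where the NVIDIA HPC SDK ships the math libraries separately. A minimal sketch of driving the new cache variables directly from CMake; the paths are illustrative placeholders, not Perlmutter's actual module layout:

```bash
# The directories passed as CUBLAS_PATH/CURAND_PATH must each contain
# lib64/libcublas.so or lib64/libcurand.so, since cuda.cmake appends
# lib64/libcublas${LIBEXT} (resp. libcurand) to the chosen root.
cmake \
  -DCUBLAS_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2 \
  -DCURAND_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2 \
  ..   # plus the usual FlexFlow flags assembled by config/config.inc
```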
12 changes: 11 additions & 1 deletion config/config.inc
@@ -62,6 +62,16 @@ if [ -n "$CUDA_DIR" ]; then
   SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}"
 fi
 
+# set cublas dir
+if [ -n "$CUBLAS_DIR" ]; then
+  SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}"
+fi
+
+# set curand dir
+if [ -n "$CURAND_DIR" ]; then
+  SET_CURAND="-DCURAND_PATH=${CURAND_DIR}"
+fi
+
 # set cudnn dir
 if [ -n "$CUDNN_DIR" ]; then
   SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}"
@@ -231,7 +241,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
   fi
 fi
 
-CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
+CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
 
 function run_cmake() {
 SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
14 changes: 10 additions & 4 deletions config/config.linux
@@ -36,12 +36,18 @@ FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"}
 # or all available architectures. TODO: support autodetect
 FF_HIP_ARCH=${FF_HIP_ARCH:-"all"}
 
-# set CUDNN dir in case cmake cannot autodetect a path
-CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
-
 # set CUDA dir in case cmake cannot autodetect a path
 CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}
 
+# set CUBLAS dir in case it is not stored in the CUDA DIR
+CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"}
+
+# set CURAND dir in case it is not stored in the CUDA DIR
+CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"}
+
+# set CUDNN dir in case cmake cannot autodetect a path
+CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
+
 # if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib,
 # otherwise, we will build nccl from source
 NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}
@@ -102,7 +108,7 @@ fi
 
 function get_build_configs() {
   # Create a string with the values of the variables set in this script
-  BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
+  BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
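The wrapper scripts plumb the same knobs through: config.linux defines CUBLAS_DIR and CURAND_DIR with CUDA-toolkit defaults, and config.inc turns any override into the -DCUBLAS_PATH/-DCURAND_PATH flags shown above. A hedged sketch of an invocation on an NVHPC-style system; the paths are placeholders for whatever your site's module system reports:

```bash
# Illustrative paths only; point each *_DIR at the directory whose lib64/
# contains the corresponding library.
export CUDA_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2
export CUBLAS_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2
export CURAND_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2
./config/config.linux   # config.inc emits -DCUBLAS_PATH/-DCURAND_PATH
```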
2 changes: 1 addition & 1 deletion include/flexflow/config.h
@@ -145,7 +145,7 @@ class FFConfig {
   Legion::Runtime *lg_hlr;
   Legion::IndexSpaceT<1> all_gpu_task_is;
   // Legion::FieldSpace field_space;
-  bool syntheticInput, profiling, perform_fusion;
+  bool benchmarking, profiling, perform_fusion;
   bool inference_debugging;
   size_t simulator_work_space_size;
   size_t search_budget;
4 changes: 3 additions & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -107,7 +107,9 @@ void parse_input_args(char **argv,
     }
   }
   if (paths.cache_folder_path.empty()) {
-    paths.cache_folder_path = "~/.cache/flexflow";
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
   }
   // Expand ~ to the home directory if needed
   wordexp_t p;
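The weight cache directory is now overridable through the environment, which matters on clusters where home-directory quotas are tight. A sketch assuming NERSC's $SCRATCH convention; any writable path works, and the ~/.cache/flexflow fallback still applies when the variable is unset:

```bash
# Redirect FlexFlow's model cache to scratch storage before launching.
export FF_CACHE_PATH="$SCRATCH/flexflow-cache"   # $SCRATCH is a NERSC convention
mkdir -p "$FF_CACHE_PATH"
```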
20 changes: 0 additions & 20 deletions inference/models/falcon.cc
@@ -252,26 +252,6 @@ void FALCON::create_falcon_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-
-#ifdef DEADCODE
-  // Compile the model
-  std::cout << "------start compile ----------" << std::endl;
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(&ff);
-  FileDataLoader fileloader("",
-                            weight_file_path,
-                            falcon_config.n_head,
-                            falcon_config.n_head_kv,
-                            falcon_config.hidden_size,
-                            falcon_config.hidden_size / falcon_config.n_head,
-                            ff.config.tensor_parallelism_degree);
-  std::cout << "------load weights ----------" << std::endl;
-  fileloader.load_weights(&ff, use_full_precision);
-  std::cout << "------load weight finished----------" << std::endl;
-
-  // init operators
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
10 changes: 0 additions & 10 deletions inference/models/llama.cc
@@ -277,16 +277,6 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-#ifdef DEADCODE
-  // Compile the model
-  std::cout << "------start compile ----------" << std::endl;
-  im->compile_model_and_allocate_buffer(&ff);
-  fileloader.load_weights(&ff);
-  std::cout << "------load weight finished----------" << std::endl;
-
-  // init operators
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
15 changes: 0 additions & 15 deletions inference/models/mpt.cc
@@ -259,21 +259,6 @@ void MPT::create_mpt_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-
-#ifdef DEADCODE
-  //------------------- compile the model --------------------------------
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(&ff);
-  FileDataLoader fileloader("",
-                            weight_file_path,
-                            mpt_config.n_heads,
-                            mpt_config.n_heads,
-                            mpt_config.hidden_size,
-                            mpt_config.hidden_size / mpt_config.n_heads,
-                            ff.config.tensor_parallelism_degree);
-  fileloader.load_weights(&ff, use_full_precision);
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
18 changes: 0 additions & 18 deletions inference/models/opt.cc
@@ -266,24 +266,6 @@ void OPT::create_opt_model(FFModel &ff,
                            use_full_precision);
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-
-#ifdef DEADCODE
-  //------------------- compile the model --------------------------------
-  std::cout << "------start compile ----------" << std::endl;
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(&ff);
-  FileDataLoader fileloader("",
-                            weight_file_path,
-                            opt_config.num_attention_heads,
-                            opt_config.num_attention_heads,
-                            opt_config.hidden_size,
-                            opt_config.hidden_size /
-                                opt_config.num_attention_heads,
-                            ff.config.tensor_parallelism_degree);
-  fileloader.load_weights(&ff, use_full_precision);
-  std::cout << "------finished loading weights----------" << std::endl;
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
10 changes: 0 additions & 10 deletions inference/models/starcoder.cc
@@ -232,16 +232,6 @@ void STARCODER::create_starcoder_model(
       ff.config.tensor_parallelism_degree,
       use_full_precision);
   im->register_model_weights_loader(&ff, fileloader);
-#ifdef DEADCODE
-  // Compile the model
-  std::cout << "------start compile ----------" << std::endl;
-  im->compile_model_and_allocate_buffer(&ff);
-  fileloader.load_weights(&ff, use_full_precision);
-  std::cout << "------load weight finished----------" << std::endl;
-
-  // init operators
-  im->init_operators_inference(&ff);
-#endif
 }
 
 }; // namespace FlexFlow
3 changes: 2 additions & 1 deletion inference/python/incr_decoding.py
@@ -55,14 +55,15 @@ def get_configs():
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
         "profiling": False,
+        "benchmarking": False,
         "inference_debugging": False,
         "fusion": True,
     }
     llm_configs = {
         # required parameters
         "llm_model": "tiiuae/falcon-7b",
         # optional parameters
-        "cache_path": "",
+        "cache_path": os.environ.get("FF_CACHE_PATH", ""),
         "refresh_cache": False,
         "full_precision": False,
         "prompt": "",
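The Python demos read the same variable when assembling their default configs, so no script edit is needed to relocate the cache. A sketch with an illustrative path:

```bash
# cache_path defaults to os.environ.get("FF_CACHE_PATH", "") per the diff above.
FF_CACHE_PATH=/tmp/ff-cache python inference/python/incr_decoding.py
```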
3 changes: 2 additions & 1 deletion inference/python/spec_infer.py
@@ -55,14 +55,15 @@ def get_configs():
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
         "profiling": False,
+        "benchmarking": False,
         "inference_debugging": False,
         "fusion": True,
     }
     llm_configs = {
         # required llm arguments
         "llm_model": "meta-llama/Llama-2-7b-hf",
         # optional llm parameters
-        "cache_path": "",
+        "cache_path": os.environ.get("FF_CACHE_PATH", ""),
         "refresh_cache": False,
         "full_precision": False,
         "ssms": [
4 changes: 3 additions & 1 deletion inference/spec_infer/spec_infer.cc
@@ -124,7 +124,9 @@ void parse_input_args(char **argv,
     }
   }
   if (paths.cache_folder_path.empty()) {
-    paths.cache_folder_path = "~/.cache/flexflow";
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
   }
   // Expand ~ to the home directory if needed
   wordexp_t p;
4 changes: 2 additions & 2 deletions inference/utils/download_hf_model.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import flexflow.serve as ff
-import argparse
+import argparse, os
 
 
 def parse_args():
@@ -12,7 +12,7 @@ def parse_args():
         "--cache-folder",
         type=str,
         help="Folder to use to store the model(s) assets in FlexFlow format",
-        default="",
+        default=os.environ.get("FF_CACHE_PATH", ""),
     )
     parser.add_argument(
         "--refresh-cache",
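This makes it easy to pre-populate the cache from a login node before launching jobs on compute nodes without internet access. A sketch; the script's remaining model arguments are not shown in this diff and are left elided:

```bash
# Either route works: the new FF_CACHE_PATH default or the explicit flag
# (the flag takes effect since it fills the same --cache-folder option).
export FF_CACHE_PATH="$SCRATCH/flexflow-cache"
python inference/utils/download_hf_model.py --cache-folder "$FF_CACHE_PATH" ...
```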
1 change: 1 addition & 0 deletions python/flexflow/core/__init__.py
@@ -41,6 +41,7 @@
     "num_cpus": "-ll:cpu",
     "legion_utility_processors": "-ll:util",
     "profiling": "--profiling",
+    "benchmarking": "--benchmarking",
     "inference_debugging": "--inference-debugging",
     "fusion": "--fusion",
     "disable_control_replication": "--disable-control-replication",
8 changes: 8 additions & 0 deletions python/flexflow/serve/__init__.py
@@ -45,6 +45,7 @@ def init(
     use_4bit_quantization: Optional[bool] = None,
     use_8bit_quantization: Optional[bool] = None,
     profiling: Optional[bool] = None,
+    benchmarking: Optional[bool] = None,
     inference_debugging: Optional[bool] = None,
     fusion: Optional[bool] = None,
 ):
@@ -72,6 +73,7 @@
     - use_4bit_quantization: whether to use 4-bit quantization, defaults to False
     - use_8bit_quantization: whether to use 8-bit quantization, defaults to False
     - profiling: whether to enable the FlexFlow profiling mode, defaults to False
+    - benchmarking: whether to run benchmarking only, without loading real weights, defaults to False
     - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
     - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
@@ -106,6 +108,8 @@
     :type use_8bit_quantization: Optional[bool], optional
     :param profiling: whether to enable the FlexFlow profiling mode, defaults to False
     :type profiling: Optional[bool], optional
+    :param benchmarking: whether to run benchmarking only, without loading real weights, defaults to False
+    :type benchmarking: Optional[bool], optional
     :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
     :type inference_debugging: Optional[bool], optional
     :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True
@@ -132,6 +136,7 @@
         use_4bit_quantization is not None,
         use_8bit_quantization is not None,
         profiling is not None,
+        benchmarking is not None,
         inference_debugging is not None,
         fusion is not None,
     ]
@@ -157,6 +162,7 @@
         "use_4bit_quantization": use_4bit_quantization,
         "use_8bit_quantization": use_8bit_quantization,
         "profiling": profiling,
+        "benchmarking": benchmarking,
         "inference_debugging": inference_debugging,
         "fusion": fusion,
     }
@@ -201,6 +207,8 @@
     configs_dict["use_8bit_quantization"] = False
     if configs_dict.get("profiling", None) is None:
         configs_dict["profiling"] = False
+    if configs_dict.get("benchmarking", None) is None:
+        configs_dict["benchmarking"] = False
     if configs_dict.get("inference_debugging", None) is None:
         configs_dict["inference_debugging"] = False
     if configs_dict.get("fusion", None) is None:
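Benchmarking mode initializes weights randomly instead of loading checkpoints, so kernel and scheduling performance can be measured without staging multi-gigabyte model files. A minimal sketch; benchmarking itself is from this diff, but the resource arguments (num_gpus, memory_per_gpu, zero_copy_memory_per_node) are assumed from the wider ff.init signature and their values are purely illustrative:

```bash
python - <<'EOF'
import flexflow.serve as ff

# benchmarking=True maps to the --benchmarking flag registered in
# python/flexflow/core/__init__.py and skips real weight loading.
ff.init(
    num_gpus=4,                      # assumed resource parameters, not from this diff
    memory_per_gpu=14000,
    zero_copy_memory_per_node=30000,
    benchmarking=True,
)
EOF
```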
(Diffs for the remaining changed files were not loaded.)