Fix inference tests in CI (flexflow#1225)
* updated diffs in tests

* manually add BOS token in LLAMA

* shellcheck

* fix
goliaro authored Nov 6, 2023
1 parent bd305f7 commit b0fe522
Showing 14 changed files with 112 additions and 97 deletions.
16 changes: 8 additions & 8 deletions .github/README.md
@@ -153,7 +153,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui
* `-ll:fsize`: size of device memory on each GPU in MB
* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters.
* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf")
- * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m-base"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
+ * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
* `-cache-folder`: the folder used to cache model weights, tokenizers, and configuration files downloaded from HuggingFace
* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used.
* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a JSON file of prompts. Users can also use the following API for registering requests:
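
As a point of reference, here is a minimal sketch (not part of this diff) of how such a prompt file could be loaded on the C++ side, assuming the file is a plain JSON array of prompt strings and using nlohmann::json, which the C++ inference examples in this repository already rely on; the helper name is illustrative:

// Minimal sketch: load a -prompt file, assumed to be a JSON array of strings.
#include <fstream>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

std::vector<std::string> load_prompts(std::string const &prompt_file_path) {
  std::ifstream input(prompt_file_path);
  nlohmann::json prompt_json = nlohmann::json::parse(input);
  std::vector<std::string> prompts;
  for (auto const &prompt : prompt_json) {
    prompts.push_back(prompt.get<std::string>());
  }
  return prompts;
}
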
@@ -193,13 +193,13 @@ Below is a list of models that we have explicitly tested and for which a SSM may

| Model | Model id on HuggingFace | Boost-tuned SSMs |
| :---- | :---- | :---- |
- | LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
+ | LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
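
Any of the boost-tuned SSMs above can be combined, since `-ssm-model` may be passed more than once. A minimal sketch (not part of this diff) of collecting the repeated flag from the command line; the helper name is illustrative and the actual parsing in `inference/spec_infer/spec_infer.cc` may differ:

// Minimal sketch: gather every "-ssm-model" argument so that several SSMs
// can be launched alongside one LLM.
#include <cstring>
#include <string>
#include <vector>

std::vector<std::string> collect_ssm_models(int argc, char **argv) {
  std::vector<std::string> ssm_models;
  for (int i = 1; i + 1 < argc; i++) {
    if (std::strcmp(argv[i], "-ssm-model") == 0) {
      ssm_models.push_back(argv[i + 1]);
    }
  }
  return ssm_models;
}
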
13 changes: 13 additions & 0 deletions CMakeLists.txt
@@ -32,6 +32,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
STRING "Choose the type of build." FORCE)
endif()

+ if(INSTALL_DIR)
+   message(STATUS "INSTALL_DIR: ${INSTALL_DIR}")
+   set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE)
+ else()
+   # Install DIR not set. Use default, unless a conda environment is active
+   if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI)
+     set(CONDA_PREFIX $ENV{CONDA_PREFIX})
+     # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path
+     set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE)
+     message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+   endif()
+ endif()
+
# do not disable assertions even if in release mode
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG")

16 changes: 8 additions & 8 deletions SERVE.md
@@ -117,7 +117,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui
* `-ll:fsize`: size of device memory on each GPU in MB
* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters.
* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf")
- * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m-base"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
+ * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
* `-cache-folder`: the folder used to cache model weights, tokenizers, and configuration files downloaded from HuggingFace
* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used.
* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a JSON file of prompts. Users can also use the following API for registering requests:
@@ -157,13 +157,13 @@ Below is a list of models that we have explicitly tested and for which a SSM may

| Model | Model id on HuggingFace | Boost-tuned SSMs |
| :---- | :---- | :---- |
- | LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
- | LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) |
+ | LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+ | LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
2 changes: 1 addition & 1 deletion config/config.inc
@@ -24,7 +24,7 @@ fi

#set installation dir
if [ -n "$INSTALL_DIR" ]; then
- SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}"
+ SET_INSTALL_DIR="-DINSTALL_DIR=${INSTALL_DIR}"
fi

if [ "$INFERENCE_TESTS" = "ON" ]; then
9 changes: 4 additions & 5 deletions include/flexflow/ffconst.h
@@ -186,11 +186,10 @@ enum OperatorType {
enum ModelType {
UNKNOWN = 3001,
LLAMA = 3002,
-   LLAMA2 = 3003,
-   OPT = 3004,
-   FALCON = 3005,
-   STARCODER = 3006,
-   MPT = 3007
+   OPT = 3003,
+   FALCON = 3004,
+   STARCODER = 3005,
+   MPT = 3006
};

enum PMParameter {
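
With the separate LLAMA2 value gone, mapping a HuggingFace `architectures` entry to a ModelType becomes a plain string lookup, which is what the `.cc` changes below reduce to. A minimal sketch (not part of this diff; helper name illustrative, remaining model families elided):

// Minimal sketch: map one "architectures" string from a HuggingFace
// config.json onto the consolidated ModelType enum.
#include <string>
#include "flexflow/ffconst.h"

ModelType model_type_from_architecture(std::string const &architecture) {
  if (architecture == "LlamaForCausalLM" || architecture == "LLaMAForCausalLM") {
    return ModelType::LLAMA; // LLAMA-2 checkpoints now map here as well
  }
  if (architecture == "OPTForCausalLM") {
    return ModelType::OPT;
  }
  // FALCON, STARCODER, and MPT branches omitted in this sketch
  return ModelType::UNKNOWN;
}
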
11 changes: 2 additions & 9 deletions inference/incr_decoding/incr_decoding.cc
@@ -177,14 +177,7 @@ void FlexFlow::top_level_task(Task const *task,
auto architectures = model_config["architectures"];
for (auto const &str : architectures) {
if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
- std::string nameOrPath = model_config["_name_or_path"];
- // TODO: support LLAMA-2 models not from Meta
- bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0;
- if (llama2) {
-   model_type = ModelType::LLAMA2;
- } else {
-   model_type = ModelType::LLAMA;
- }
+ model_type = ModelType::LLAMA;
break;
} else if (str == "OPTForCausalLM") {
model_type = ModelType::OPT;
@@ -220,7 +213,7 @@ void FlexFlow::top_level_task(Task const *task,
rm->register_output_filepath(file_paths.output_file_path);

FFModel model(ffconfig, ffconfig.cpu_offload);
- if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) {
+ if (model_type == ModelType::LLAMA) {
LLAMA::create_llama_model(model,
config_filepath,
weights_filepath,
2 changes: 1 addition & 1 deletion inference/python/spec_infer.py
@@ -68,7 +68,7 @@ def get_configs():
"ssms": [
{
# required ssm parameter
"ssm_model": "JackFram/llama-160m-base",
"ssm_model": "JackFram/llama-160m",
# optional ssm parameters
"cache_path": "",
"refresh_cache": False,
24 changes: 4 additions & 20 deletions inference/spec_infer/spec_infer.cc
@@ -166,14 +166,7 @@ void get_model_meta(FilePaths &file_paths,
auto architectures = llm_model_config["architectures"];
for (auto const &str : architectures) {
if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
- std::string nameOrPath = llm_model_config["_name_or_path"];
- // TODO: support LLAMA-2 models not from Meta
- bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0;
- if (llama2) {
-   model_metadata.llm_model_type = ModelType::LLAMA2;
- } else {
-   model_metadata.llm_model_type = ModelType::LLAMA;
- }
+ model_metadata.llm_model_type = ModelType::LLAMA;
break;
} else if (str == "OPTForCausalLM") {
model_metadata.llm_model_type = ModelType::OPT;
@@ -223,14 +216,7 @@ void get_model_meta(FilePaths &file_paths,
auto architectures = ssm_model_config["architectures"];
for (auto const &str : architectures) {
if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
- std::string nameOrPath = ssm_model_config["_name_or_path"];
- // TODO: support LLAMA-2 models not from Meta
- bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0;
- if (llama2) {
-   ssm_model_type = ModelType::LLAMA2;
- } else {
-   ssm_model_type = ModelType::LLAMA;
- }
+ ssm_model_type = ModelType::LLAMA;
break;
} else if (str == "OPTForCausalLM") {
ssm_model_type = ModelType::OPT;
@@ -318,8 +304,7 @@ void FlexFlow::top_level_task(Task const *task,

// Create LLM model
FFModel tree_model(ffconfig, ffconfig.cpu_offload);
- if (model_metadata.llm_model_type == ModelType::LLAMA ||
-     model_metadata.llm_model_type == ModelType::LLAMA2) {
+ if (model_metadata.llm_model_type == ModelType::LLAMA) {
LLAMA::create_llama_model(tree_model,
model_metadata.llm_model_config_path,
model_metadata.llm_weights_path,
@@ -363,8 +348,7 @@ void FlexFlow::top_level_task(Task const *task,

for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) {
FFModel &beam_model = ssm_models[ssm_id];
- if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA ||
-     model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA2) {
+ if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) {
LLAMA::create_llama_model(beam_model,
model_metadata.ssm_model_config_paths[ssm_id],
model_metadata.ssm_model_weights_paths[ssm_id],
9 changes: 4 additions & 5 deletions python/flexflow/type.py
@@ -75,11 +75,10 @@ class InferenceMode(Enum):
class ModelType(Enum):
UNKNOWN = 3001
LLAMA = 3002
-     LLAMA2 = 3003
-     OPT = 3004
-     FALCON = 3005
-     STARCODER = 3006
-     MPT = 3007
+     OPT = 3003
+     FALCON = 3004
+     STARCODER = 3005
+     MPT = 3006


class OpType(Enum):
26 changes: 25 additions & 1 deletion src/runtime/request_manager.cc
@@ -115,7 +115,7 @@ void RequestManager::register_tokenizer(ModelType type,
this->eos_token_id = eos_token_id;
std::string tokenizer_folder =
(!path.empty() && path.back() != '/') ? path + '/' : path;
- if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) {
+ if (model_type == ModelType::LLAMA) {
bool path_to_file = !path.empty() &&
(path.size() >= strlen("tokenizer.model")) &&
path.find("tokenizer.model") ==
@@ -416,6 +416,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
old_bc.requestsInfo[i].request_guid,
request.tokens.size());
std::string output = this->tokenizer_->Decode(request.tokens);
+ // Unlike Huggingface, the sentencepiece C++ library automatically
+ // removes the BOS token
+ if (model_type == ModelType::LLAMA &&
+     request.tokens.at(0) == bos_token_id) {
+   output = "<s> " + output;
+ }

{
// update generation result and trigger future
@@ -625,6 +631,12 @@ BeamSearchBatchConfig
request.guid,
request.tokens.size());
std::string output = this->tokenizer_->Decode(request.tokens);
+ // Unlike Huggingface, the sentencepiece C++ library automatically
+ // removes the BOS token
+ if (model_type == ModelType::LLAMA &&
+     request.tokens.at(0) == bos_token_id) {
+   output = "<s> " + output;
+ }
{
// update generation result and trigger future
GenerationResult &gr = request_generation_results[request.guid];
@@ -736,6 +748,12 @@
}
}
std::string output = this->tokenizer_->Decode(request.tokens);
+ // Unlike Huggingface, the sentencepiece C++ library automatically
+ // removes the BOS token
+ if (model_type == ModelType::LLAMA &&
+     request.tokens.at(0) == bos_token_id) {
+   output = "<s> " + output;
+ }
log_req_mgr.print("Output: %s", output.c_str());
}
} else if (request.status == Request::PENDING) {
@@ -769,6 +787,12 @@

// Token Info
std::string output = this->tokenizer_->Decode(request.tokens);
+ // Unlike Huggingface, the sentencepiece C++ library automatically removes
+ // the BOS token
+ if (model_type == ModelType::LLAMA &&
+     request.tokens.at(0) == bos_token_id) {
+   output = "<s> " + output;
+ }
log_req_mgr.print("Output: %s", output.c_str());
} else {
assert(false);
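
The BOS re-add above appears once per decode path. Here is a minimal sketch (not part of this diff) of how it could be hoisted into a single helper, assuming only the Decode() call and bos_token_id visible in the hunks, with the tokenizer and token-id types left generic because they are not shown here:

// Minimal sketch: decode a request's tokens and restore the "<s>" marker
// that the sentencepiece C++ library strips from LLAMA outputs.
#include <string>
#include <vector>
#include "flexflow/ffconst.h"

template <typename TokenizerT, typename TokenId>
std::string decode_with_bos(TokenizerT &tokenizer,
                            std::vector<TokenId> const &tokens,
                            ModelType model_type,
                            TokenId bos_token_id) {
  std::string output = tokenizer.Decode(tokens);
  if (model_type == ModelType::LLAMA && !tokens.empty() &&
      tokens.front() == bos_token_id) {
    output = "<s> " + output;
  }
  return output;
}

Each call site above would then reduce to something like output = decode_with_bos(*this->tokenizer_, request.tokens, model_type, bos_token_id);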