Commit

baichuan model ok with python client
lambda7xx committed Oct 10, 2023
1 parent 15ece9c commit 0085702
Showing 6 changed files with 35 additions and 4 deletions.
24 changes: 24 additions & 0 deletions baicuan.py
@@ -0,0 +1,24 @@
import flexflow.serve as ff

# Initialize the FlexFlow runtime. ff.init() takes a dictionary (as a positional argument) or named key-value parameters
ff.init(
    num_gpus=4,
    memory_per_gpu=14000,
    zero_copy_memory_per_node=30000,
    tensor_parallelism_degree=4,
    pipeline_parallelism_degree=1
)

# Create the FlexFlow LLM
llm = ff.LLM("baichuan-inc/Baichuan-7B")

# Create the sampling configs
generation_config = ff.GenerationConfig(
    do_sample=True, temperature=0.9, topp=0.8, topk=1
)

# Compile the LLM for inference and load the weights into memory
llm.compile(generation_config)

# Generation begins!
result = llm.generate("Here are some travel tips for Tokyo:\n")
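As the comment in the script notes, ff.init() also accepts a single configuration dictionary as a positional argument instead of keyword parameters. A minimal sketch of that equivalent call, assuming the dictionary keys mirror the keyword names used above:

import flexflow.serve as ff

# Sketch, not part of the commit: same configuration passed as a positional dict.
ff.init(
    {
        "num_gpus": 4,
        "memory_per_gpu": 14000,
        "zero_copy_memory_per_node": 30000,
        "tensor_parallelism_degree": 4,
        "pipeline_parallelism_degree": 1,
    }
)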
5 changes: 3 additions & 2 deletions inference/utils/download_hf_model.py
@@ -43,6 +43,7 @@ def main(args):
data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF)

data_types = (ff.DataType.DT_HALF, ff.DataType.DT_HALF)
+data_types = [ff.DataType.DT_HALF]
for model_name in args.model_names:
for data_type in data_types:
llm = ff.LLM(
@@ -51,9 +52,9 @@
cache_path=args.cache_folder,
refresh_cache=args.refresh_cache,
)
-llm.download_hf_weights_if_needed()
+#llm.download_hf_weights_if_needed()
llm.download_hf_tokenizer_if_needed()
-llm.download_hf_config()
+#llm.download_hf_config()


if __name__ == "__main__":
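The hunk above hard-codes half precision and comments out the weight and config downloads, which suits a quick local test. A sketch of a switchable variant instead of the hard-coded list (the half_precision_only attribute is hypothetical, not part of this commit):

# Sketch only; "half_precision_only" is a hypothetical CLI flag on args.
data_types = (
    [ff.DataType.DT_HALF]
    if getattr(args, "half_precision_only", False)
    else [ff.DataType.DT_FLOAT, ff.DataType.DT_HALF]
)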
3 changes: 3 additions & 0 deletions istall.sh
@@ -0,0 +1,3 @@
pip3 install .

pip3 install torch==1.13
1 change: 1 addition & 0 deletions python/flexflow/serve/models/baichuan.py
@@ -28,6 +28,7 @@ def __init__(self, hf_config):
self.intermediate_size = hf_config.intermediate_size
self.num_attention_heads = hf_config.num_attention_heads
self.max_position_embeddings = hf_config.max_position_embeddings
+self.num_key_value_heads = hf_config.num_attention_heads

class FlexFlowBAICHUAN(FlexFlowModel):
def __init__(
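The added line gives the Baichuan config a num_key_value_heads field by reusing num_attention_heads, i.e. standard multi-head attention where every query head has its own key/value head. A defensive sketch that prefers the HF config's own value when it defines one (an alternative, not what the commit does):

# Sketch for the same __init__: fall back to plain multi-head attention only
# when the HF config does not define num_key_value_heads.
self.num_key_value_heads = getattr(
    hf_config, "num_key_value_heads", hf_config.num_attention_heads
)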
5 changes: 3 additions & 2 deletions python/flexflow/serve/serve.py
@@ -217,7 +217,7 @@ def download_hf_weights_if_needed(self):
if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
print("Done downloading HF weights. Converting them now...")
# Convert the model to FlexFlow format
print(f"self.model_class:{self.mode_clas}")
#print(f"self.model_class:{self.model_clas}")
self.model_class.convert_hf_model(hf_model, self.weights_path)
# Save new revision hash to file
with open(ff_revision_file, "w+") as f:
@@ -386,7 +386,7 @@

# Download the weights and tokenizer from huggingface (if needed) and load them
self.__load_hf_weights()
-self.download_hf_tokenizer_if_needed()
+#self.download_hf_tokenizer_if_needed()

# Create tokenizer (this must be done after we have downloaded the tokenizer
bos_token_id = (
@@ -395,6 +395,7 @@
eos_token_id = (
-1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id
)
+self.tokenizer_path = "/home/lambda/.cache/flexflow/tokenizers/baichuan-inc/baichuan-7b"
self.rm.register_tokenizer(
self.model_type, bos_token_id, eos_token_id, self.tokenizer_path
)
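The tokenizer path added above is pinned to one machine's cache directory. A sketch that derives the same location from the user's home directory, assuming the ~/.cache/flexflow/tokenizers/<model> layout implied by that path:

import os

# Sketch only: rebuild the cache location implied by the hard-coded path
# without pinning it to /home/lambda.
tokenizer_path = os.path.join(
    os.path.expanduser("~"),
    ".cache", "flexflow", "tokenizers", "baichuan-inc", "baichuan-7b",
)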
1 change: 1 addition & 0 deletions src/runtime/request_manager.cc
@@ -123,6 +123,7 @@ void RequestManager::register_tokenizer(ModelType type,
(path.size() - strlen("tokenizer.model"));
std::string tokenizer_filepath =
path_to_file ? path : tokenizer_folder + "tokenizer.model";
std::cout<<"tokenizer_filepath:"<<tokenizer_filepath<<std::endl;
this->tokenizer_ =
Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath));
} else if (model_type == ModelType::OPT) {
