diff --git a/baicuan.py b/baicuan.py
new file mode 100644
index 0000000000..3fcbcb7fd7
--- /dev/null
+++ b/baicuan.py
@@ -0,0 +1,24 @@
+import flexflow.serve as ff
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary (as a positional argument) or named key-value parameters
+ff.init(
+    num_gpus=4,
+    memory_per_gpu=14000,
+    zero_copy_memory_per_node=30000,
+    tensor_parallelism_degree=4,
+    pipeline_parallelism_degree=1
+    )
+
+# Create the FlexFlow LLM
+llm = ff.LLM("baichuan-inc/Baichuan-7B")
+
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+    do_sample=True, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config)
+
+# Generation begins!
+result = llm.generate("Here are some travel tips for Tokyo:\n")
\ No newline at end of file
diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py
index daa1da605c..92b08cc188 100644
--- a/inference/utils/download_hf_model.py
+++ b/inference/utils/download_hf_model.py
@@ -43,6 +43,7 @@ def main(args):
     data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF)
     data_types = (ff.DataType.DT_HALF, ff.DataType.DT_HALF)
+    data_types = [ff.DataType.DT_HALF]
 
     for model_name in args.model_names:
         for data_type in data_types:
             llm = ff.LLM(
@@ -51,9 +52,9 @@
                 cache_path=args.cache_folder,
                 refresh_cache=args.refresh_cache,
             )
-            llm.download_hf_weights_if_needed()
+            #llm.download_hf_weights_if_needed()
             llm.download_hf_tokenizer_if_needed()
-            llm.download_hf_config()
+            #llm.download_hf_config()


 if __name__ == "__main__":
diff --git a/istall.sh b/istall.sh
new file mode 100644
index 0000000000..1a37762c55
--- /dev/null
+++ b/istall.sh
@@ -0,0 +1,3 @@
+pip3 install .
+
+pip3 install torch==1.13
\ No newline at end of file
diff --git a/python/flexflow/serve/models/baichuan.py b/python/flexflow/serve/models/baichuan.py
index 6d21dbc352..fa83cf37ea 100644
--- a/python/flexflow/serve/models/baichuan.py
+++ b/python/flexflow/serve/models/baichuan.py
@@ -28,6 +28,7 @@ def __init__(self, hf_config):
         self.intermediate_size = hf_config.intermediate_size
         self.num_attention_heads = hf_config.num_attention_heads
         self.max_position_embeddings = hf_config.max_position_embeddings
+        self.num_key_value_heads = hf_config.num_attention_heads

 class FlexFlowBAICHUAN(FlexFlowModel):
     def __init__(
diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py
index 1f5fcdab72..b4eedcef0e 100644
--- a/python/flexflow/serve/serve.py
+++ b/python/flexflow/serve/serve.py
@@ -217,7 +217,7 @@ def download_hf_weights_if_needed(self):
         if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
             print("Done downloading HF weights. Converting them now...")
Converting them now...") # Convert the model to FlexFlow format - print(f"self.model_class:{self.mode_clas}") + #print(f"self.model_class:{self.model_clas}") self.model_class.convert_hf_model(hf_model, self.weights_path) # Save new revision hash to file with open(ff_revision_file, "w+") as f: @@ -386,7 +386,7 @@ def compile( # Download the weights and tokenizer from huggingface (if needed) and load them self.__load_hf_weights() - self.download_hf_tokenizer_if_needed() + #self.download_hf_tokenizer_if_needed() # Create tokenizer (this must be done after we have downloaded the tokenizer bos_token_id = ( @@ -395,6 +395,7 @@ def compile( eos_token_id = ( -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id ) + self.tokenizer_path = "/home/lambda/.cache/flexflow/tokenizers/baichuan-inc/baichuan-7b" self.rm.register_tokenizer( self.model_type, bos_token_id, eos_token_id, self.tokenizer_path ) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 12996c2c4f..3351793b05 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -123,6 +123,7 @@ void RequestManager::register_tokenizer(ModelType type, (path.size() - strlen("tokenizer.model")); std::string tokenizer_filepath = path_to_file ? path : tokenizer_folder + "tokenizer.model"; + std::cout<<"tokenizer_filepath:"<tokenizer_ = Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); } else if (model_type == ModelType::OPT) {