Commit

baichuan model ok with python client
lambda7xx committed Oct 10, 2023
1 parent 15ece9c commit 0085702
Showing 6 changed files with 35 additions and 4 deletions.
24 changes: 24 additions & 0 deletions baicuan.py
@@ -0,0 +1,24 @@
import flexflow.serve as ff

# Initialize the FlexFlow runtime. ff.init() takes a dictionary (as a positional argument) or named key-value parameters
ff.init(
    num_gpus=4,
    memory_per_gpu=14000,
    zero_copy_memory_per_node=30000,
    tensor_parallelism_degree=4,
    pipeline_parallelism_degree=1
)

# Create the FlexFlow LLM
llm = ff.LLM("baichuan-inc/Baichuan-7B")

# Create the sampling configs
generation_config = ff.GenerationConfig(
    do_sample=True, temperature=0.9, topp=0.8, topk=1
)

# Compile the LLM for inference and load the weights into memory
llm.compile(generation_config)

# Generation begins!
result = llm.generate("Here are some travel tips for Tokyo:\n")
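As the comment in the script notes, ff.init() also accepts a single configuration dictionary as a positional argument instead of keyword parameters. A minimal sketch of that equivalent call, assuming the dictionary keys mirror the keyword names used above:

import flexflow.serve as ff

# Sketch, not part of the commit: same configuration passed as a positional dict.
ff.init(
    {
        "num_gpus": 4,
        "memory_per_gpu": 14000,
        "zero_copy_memory_per_node": 30000,
        "tensor_parallelism_degree": 4,
        "pipeline_parallelism_degree": 1,
    }
)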
5 changes: 3 additions & 2 deletions inference/utils/download_hf_model.py
@@ -43,6 +43,7 @@ def main(args):
data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF)

data_types = (ff.DataType.DT_HALF, ff.DataType.DT_HALF)
+data_types = [ff.DataType.DT_HALF]
for model_name in args.model_names:
for data_type in data_types:
llm = ff.LLM(
@@ -51,9 +52,9 @@
cache_path=args.cache_folder,
refresh_cache=args.refresh_cache,
)
-llm.download_hf_weights_if_needed()
+#llm.download_hf_weights_if_needed()
llm.download_hf_tokenizer_if_needed()
-llm.download_hf_config()
+#llm.download_hf_config()


if __name__ == "__main__":
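The hunk above hard-codes half precision and comments out the weight and config downloads, which suits a quick local test. A sketch of a switchable variant instead of the hard-coded list (the half_precision_only attribute is hypothetical, not part of this commit):

# Sketch only; "half_precision_only" is a hypothetical CLI flag on args.
data_types = (
    [ff.DataType.DT_HALF]
    if getattr(args, "half_precision_only", False)
    else [ff.DataType.DT_FLOAT, ff.DataType.DT_HALF]
)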
3 changes: 3 additions & 0 deletions istall.sh
@@ -0,0 +1,3 @@
pip3 install .

pip3 install torch==1.13
1 change: 1 addition & 0 deletions python/flexflow/serve/models/baichuan.py
@@ -28,6 +28,7 @@ def __init__(self, hf_config):
self.intermediate_size = hf_config.intermediate_size
self.num_attention_heads = hf_config.num_attention_heads
self.max_position_embeddings = hf_config.max_position_embeddings
+self.num_key_value_heads = hf_config.num_attention_heads

class FlexFlowBAICHUAN(FlexFlowModel):
def __init__(
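The added line gives the Baichuan config a num_key_value_heads field by reusing num_attention_heads, i.e. standard multi-head attention where every query head has its own key/value head. A defensive sketch that prefers the HF config's own value when it defines one (an alternative, not what the commit does):

# Sketch for the same __init__: fall back to plain multi-head attention only
# when the HF config does not define num_key_value_heads.
self.num_key_value_heads = getattr(
    hf_config, "num_key_value_heads", hf_config.num_attention_heads
)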
5 changes: 3 additions & 2 deletions python/flexflow/serve/serve.py
@@ -217,7 +217,7 @@ def download_hf_weights_if_needed(self):
if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
print("Done downloading HF weights. Converting them now...")
# Convert the model to FlexFlow format
print(f"self.model_class:{self.mode_clas}")
#print(f"self.model_class:{self.model_clas}")
self.model_class.convert_hf_model(hf_model, self.weights_path)
# Save new revision hash to file
with open(ff_revision_file, "w+") as f:
@@ -386,7 +386,7 @@

# Download the weights and tokenizer from huggingface (if needed) and load them
self.__load_hf_weights()
-self.download_hf_tokenizer_if_needed()
+#self.download_hf_tokenizer_if_needed()

# Create tokenizer (this must be done after we have downloaded the tokenizer
bos_token_id = (
@@ -395,6 +395,7 @@
eos_token_id = (
-1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id
)
+self.tokenizer_path = "/home/lambda/.cache/flexflow/tokenizers/baichuan-inc/baichuan-7b"
self.rm.register_tokenizer(
self.model_type, bos_token_id, eos_token_id, self.tokenizer_path
)
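The tokenizer path added above is pinned to one machine's cache directory. A sketch that derives the same location from the user's home directory, assuming the ~/.cache/flexflow/tokenizers/<model> layout implied by that path:

import os

# Sketch only: rebuild the cache location implied by the hard-coded path
# without pinning it to /home/lambda.
tokenizer_path = os.path.join(
    os.path.expanduser("~"),
    ".cache", "flexflow", "tokenizers", "baichuan-inc", "baichuan-7b",
)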
1 change: 1 addition & 0 deletions src/runtime/request_manager.cc
@@ -123,6 +123,7 @@ void RequestManager::register_tokenizer(ModelType type,
(path.size() - strlen("tokenizer.model"));
std::string tokenizer_filepath =
path_to_file ? path : tokenizer_folder + "tokenizer.model";
std::cout<<"tokenizer_filepath:"<<tokenizer_filepath<<std::endl;
this->tokenizer_ =
Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath));
} else if (model_type == ModelType::OPT) {
