From 1904b2ba826522fa6ec729efe30bbf8b1adef783 Mon Sep 17 00:00:00 2001
From: Mengkejiergeli Ba
Date: Mon, 11 Nov 2024 09:01:29 +0000
Subject: [PATCH] chatglm: Add test and inference example

---
 examples/text-generation/README.md    | 20 ++++++++++++++++++++
 tests/test_text_generation_example.py |  1 +
 2 files changed, 21 insertions(+)

diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 9e7d728205..5ecda87e7f 100755
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -89,6 +89,26 @@ python run_generation.py \
 --prompt "Hello world" "How are you?"
 ```
 
+Here is an example for THUDM/glm-4-9b-chat:
+```
+python3 run_generation.py \
+--model_name_or_path THUDM/glm-4-9b-chat \
+--use_hpu_graphs \
+--use_kv_cache \
+--do_sample \
+--bf16 \
+--trim_logits \
+--batch_size 1 \
+--max_input_tokens 1024 \
+--max_new_tokens 512 \
+--reuse_cache \
+--use_flash_attention
+```
+Note that for chatglm2 and chatglm3 models, you need to set the `GLM` environment variable so that the corresponding tokenizer is loaded:
+```
+GLM=2 or GLM=3
+```
+
 > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size.
 
 ### Run Speculative Sampling on Gaudi
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index ed1a094e47..51b080be24 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -60,6 +60,7 @@
         ("openbmb/MiniCPM3-4B", 1, False, 65.116, False),
         ("baichuan-inc/Baichuan2-7B-Chat", 1, True, 108, False),
         ("baichuan-inc/Baichuan2-13B-Chat", 1, False, 66, False),
+        ("THUDM/glm-4-9b-chat", 1, True, 105, False),
     ],
     "fp8": [
         ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68),
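
For reference, a minimal sketch of how the `GLM` variable from the README note above could be used when running a chatglm3 checkpoint. The inline `GLM=3` prefix and the model path `THUDM/chatglm3-6b` are assumptions for illustration, not part of this patch; the flags are taken from the glm-4-9b-chat example added above.
```
# Hypothetical invocation: GLM=3 selects the chatglm3 tokenizer per the README note;
# THUDM/chatglm3-6b is an assumed example checkpoint name.
GLM=3 python3 run_generation.py \
--model_name_or_path THUDM/chatglm3-6b \
--use_hpu_graphs \
--use_kv_cache \
--bf16 \
--batch_size 1 \
--max_new_tokens 512
```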