From 1904b2ba826522fa6ec729efe30bbf8b1adef783 Mon Sep 17 00:00:00 2001
From: Mengkejiergeli Ba
Date: Mon, 11 Nov 2024 09:01:29 +0000
Subject: [PATCH] chatglm: Add test and inference example

---
 examples/text-generation/README.md    | 20 ++++++++++++++++++++
 tests/test_text_generation_example.py |  1 +
 2 files changed, 21 insertions(+)

diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 9e7d728205..5ecda87e7f 100755
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -89,6 +89,26 @@ python run_generation.py \
 --prompt "Hello world" "How are you?"
 ```
 
+Here is an example for THUDM/glm-4-9b-chat:
+```
+python3 run_generation.py \
+--model_name_or_path THUDM/glm-4-9b-chat \
+--use_hpu_graphs \
+--use_kv_cache \
+--do_sample \
+--bf16 \
+--trim_logits \
+--batch_size 1 \
+--max_input_tokens 1024 \
+--max_new_tokens 512 \
+--reuse_cache \
+--use_flash_attention
+```
+Note that for chatglm2 and chatglm3 models, you need to set the `GLM` environment variable so that the corresponding tokenizer is loaded:
+```
+GLM=2 or GLM=3
+```
+
 > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size.
 
 ### Run Speculative Sampling on Gaudi
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index ed1a094e47..51b080be24 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -60,6 +60,7 @@
         ("openbmb/MiniCPM3-4B", 1, False, 65.116, False),
         ("baichuan-inc/Baichuan2-7B-Chat", 1, True, 108, False),
         ("baichuan-inc/Baichuan2-13B-Chat", 1, False, 66, False),
+        ("THUDM/glm-4-9b-chat", 1, True, 105, False),
     ],
     "fp8": [
         ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68),
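
For reference, a minimal sketch of how the `GLM` variable from the README note above could be used when running a chatglm3 checkpoint. The inline `GLM=3` prefix and the model path `THUDM/chatglm3-6b` are assumptions for illustration, not part of this patch; the flags are taken from the glm-4-9b-chat example added above.
```
# Hypothetical invocation: GLM=3 selects the chatglm3 tokenizer per the README note;
# THUDM/chatglm3-6b is an assumed example checkpoint name.
GLM=3 python3 run_generation.py \
--model_name_or_path THUDM/chatglm3-6b \
--use_hpu_graphs \
--use_kv_cache \
--bf16 \
--batch_size 1 \
--max_new_tokens 512
```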