chatglm: Add pretrain example
mengker33 committed Nov 13, 2024
1 parent 1b9e9bf commit 615f749
Showing 2 changed files with 37 additions and 3 deletions.
28 changes: 28 additions & 0 deletions examples/language-modeling/README.md
@@ -131,6 +131,34 @@ python ../gaudi_spawn.py \
This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json


### Multi-card Training with Deepspeed (chatglm3-6b)
Note that you need to set GLM=3 to load the chatglm3 tokenizer, and GLM=2 for chatglm2; if GLM is not set, the glm4 tokenizer is used by default (an illustrative sketch of this mapping follows the command below).
```bash
GLM=3 python ../gaudi_spawn.py \
--world_size 8 --use_deepspeed run_clm.py \
--config_name THUDM/chatglm3-6b \
--tokenizer_name THUDM/chatglm3-6b \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--do_train \
--do_eval \
--deepspeed llama2_ds_zero3_config.json \
--output_dir /tmp/test-clm \
--gaudi_config_name Habana/gpt2 \
--use_habana \
--use_lazy_mode \
--throughput_warmup_steps 3 \
--bf16 \
--block_size 1024 \
--use_cache False \
--overwrite_output_dir \
--logging_first_step True \
--logging_steps 20
```
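
For illustration only, here is a minimal sketch of how the `GLM` variable could map to a tokenizer. The mapping and repository IDs below are assumptions made for this example, not the selection logic actually implemented in `run_clm.py`:

```python
import os

from transformers import AutoTokenizer

# Hypothetical mapping from the GLM environment variable to a tokenizer repository.
# The real selection happens inside run_clm.py and the ChatGLM modeling code.
GLM_TOKENIZERS = {
    "2": "THUDM/chatglm2-6b",
    "3": "THUDM/chatglm3-6b",
}

glm_version = os.environ.get("GLM")  # unset -> fall back to the glm4 tokenizer
tokenizer_name = GLM_TOKENIZERS.get(glm_version, "THUDM/glm-4-9b")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
```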


## Multi-Node Training with Deepspeed (GPT-NeoX)

The following command triggers the fine-tuning of [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) on WikiText-2 with Deepspeed ZeRO-2.
12 changes: 9 additions & 3 deletions examples/language-modeling/run_clm.py
@@ -431,6 +431,10 @@ def main():
config.update_from_string(model_args.config_overrides)
logger.info(f"New config: {config}")

# Note that chatglm2/3 have float16 dtype in their config.json, while on Gaudi we need to use bfloat16.
if config.model_type == "chatglm":
    config.dtype = "torch.bfloat16"

tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
"use_fast": model_args.use_fast_tokenizer,
@@ -472,9 +476,11 @@ def main():

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))
# Skip this test for chatglm pretraining.
if config.model_type not in ("chatglm",):
    embedding_size = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_size:
        model.resize_token_embeddings(len(tokenizer))

# Preprocessing the datasets.
# First we tokenize all the texts.