feat: add summarize rlhf codes
congchan authored Nov 12, 2023
1 parent 36baf4e commit 410d0d7
Showing 18 changed files with 1,909 additions and 27 deletions.
33 changes: 31 additions & 2 deletions README.md
@@ -4,17 +4,34 @@ A platform for training large language model based multi-turn chatbots.
Developed on top of FastChat, trl, trlx, and Hugging Face transformers.

## Features
* Supports Llama, Baichuan, and Qwen series models.
* Supports training LLMs.
* Character-aware templates.
* Long sequence length training with [FlashAttention2](https://github.com/HazyResearch/flash-attention).
* A stacked dataset class to support training on long chats.
* Ghost Attention (GAtt) in templates.

## Installation
```
pip3 install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ --upgrade -e ".[data]"
pip3 install -e ".[data]"
```

This project uses the transformers Trainer to manage the training process, and DeepSpeed to handle data parallelism and ZeRO (stage 1, 2, 3) optimization. Please install transformers and `deepspeed>=0.10.3`.

The project has been fully tested with `deepspeed>=0.10.3`; older versions of DeepSpeed may exhibit unexpected bugs and errors.

Since [transformers>=4.34.0](https://github.com/huggingface/transformers/releases), Hugging Face transformers ships FlashAttention-2 support integrated into the Llama and Falcon series models. I recommend using `transformers>=4.34.0` instead of writing extra patches.
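
For example, a minimal sketch of enabling the built-in FlashAttention-2 path when loading a model with `transformers>=4.34.0` (the model name here is illustrative):
```python
import torch
from transformers import AutoModelForCausalLM

# Model name is illustrative; any FlashAttention-2-supported architecture
# (e.g. Llama, Falcon) works the same way in transformers>=4.34.0.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,  # FlashAttention-2 requires fp16 or bf16
    use_flash_attention_2=True,
)
```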

Please go to [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention) and install FlashAttention-2, or just run the command below to install it.
```
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip3 install flash_attn --no-build-isolation
```

Some models (such as [Baichuan2](https://github.com/baichuan-inc/Baichuan2)) support [xformers](https://github.com/facebookresearch/xformers); you need to install it yourself.
```
pip3 install -U xformers
```


# Pre-training

# Fine-tuning
@@ -23,5 +40,17 @@ pip3 install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ --upgrade
python -m llm.data.process_chats --dataset PIPPA
```

2. The example training script is `run_finetune.sh`; replace the argument values according to your actual needs. Depending on your hardware and cluster, you will need to handle multi-node training yourself.


# Alignment
I have reproduced Stiennon, Nisan, et al., Learning to Summarize from Human Feedback (arXiv:2009.01325, 15 Feb. 2022, http://arxiv.org/abs/2009.01325).
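
The reward model in that setup is trained on pairwise human preferences. As a reference, here is a minimal sketch of the pairwise ranking loss from the paper (illustrative, not the exact implementation in this commit):
```python
import torch
import torch.nn.functional as F

def pairwise_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
    """Ranking loss from Stiennon et al. (2020): -log sigmoid(r_chosen - r_rejected)."""
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
```
This objective pushes the reward of the chosen response above the rejected one, which is also what the `compute_metrics` accuracy in the reward-modeling script below measures: the fraction of pairs where the chosen response outscores the rejected one.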



# Citation
* Stiennon, Nisan, et al. Learning to Summarize from Human Feedback. arXiv:2009.01325, arXiv, 15 Feb. 2022. arXiv.org, http://arxiv.org/abs/2009.01325.
* https://github.com/lm-sys/FastChat
* https://github.com/CarperAI/trlx
* https://github.com/lvwerra/trl/
* https://github.com/huggingface/transformers
42 changes: 42 additions & 0 deletions llm/alignment/reward_modeling/ds_config_zero2.json
@@ -0,0 +1,42 @@
{
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "gather_16bit_weights_on_model_save": true,
        "round_robin_gradients": true,
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e8
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
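
In this config, every `"auto"` value is resolved by the Hugging Face Trainer's DeepSpeed integration from the corresponding `TrainingArguments`. A minimal sketch of wiring it up (paths and hyperparameters here are illustrative):
```python
from transformers import TrainingArguments

# "auto" entries in ds_config_zero2.json (lr, batch sizes, gradient clipping,
# etc.) are filled in from these arguments at launch time.
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    bf16=True,
    deepspeed="llm/alignment/reward_modeling/ds_config_zero2.json",
)
```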
@@ -191,3 +191,16 @@ def freeze_bottom_causal_layers(model: nn.Module, num_layers_unfrozen):
        layer.requires_grad_(False)

    return num_layers, num_layers_unfrozen


def make_reward_model(model_name, type_t, tok_path, save_model):
    if type_t == "classification":
        # Scalar-reward head: a sequence classifier with a single label,
        # built from the base model's config.
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = 1
        reward_model = AutoModelForSequenceClassification.from_config(config)
    elif type_t == "causal":
        # Causal-LM-based reward model; RewardModel receives the EOS token id.
        tokenizer = AutoTokenizer.from_pretrained(tok_path)
        reward_model = RewardModel(model_name, tokenizer(tokenizer.eos_token)["input_ids"][0], save_model)
    else:
        raise ValueError("Unsupported reward model type {}".format(type_t))
    return reward_model
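
A minimal usage sketch of this helper, assuming the surrounding module's imports (`AutoConfig`, `AutoModelForSequenceClassification`, `AutoTokenizer`, `RewardModel`); the model names and the `save_model` value here are illustrative:
```python
# Classification-style reward model: a 1-label sequence classifier head
# (tok_path and save_model are unused in this branch).
rm_cls = make_reward_model("gpt2", "classification", tok_path=None, save_model=None)

# Causal-style reward model: wraps a causal LM and passes the EOS token id.
rm_causal = make_reward_model("gpt2", "causal", tok_path="gpt2", save_model=None)
```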
@@ -31,38 +31,38 @@
from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint, get_fp32_state_dict_from_zero_checkpoint

from reward_utils import (
    RewardModel,
    freeze_bottom_causal_layers,
    SparsePairwiseTrainer,
    SparsePairwiseShuffleTrainer, PreTrainedRewardModel,
    SparsePairwiseShuffleTrainer,
    PreTrainedRewardModel,
)

from reward_modeling.rm_utils import get_logger


def parse_args():
    parser = argparse.ArgumentParser(description="Finetune a reward model.")
    parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help="The name or path of the dataset to use (via the datasets library).",
    )
    parser.add_argument(
        "--init_model_name_or_path",
        "--sft_model_name_or_path",
        type=str,
        default=None,
        help="Model path for init a reward model.",
        help="SFT model path for init a reward model.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        default="./outputs",
        help="output directory for logs, ckpting, etc..",
    )
    parser.add_argument(
        "--deepspeed_config_file",
        type=str,
        default="configs/ds_config_zero2.json",
        default=None,
        help="Specify deepspeed config file.",
    )
    parser.add_argument(
@@ -96,13 +96,13 @@ def parse_args():
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=1,
        default=8,
        help="The batch size per GPU/TPU core/CPU for training."
    )
    parser.add_argument(
        "--per_device_eval_batch_size",
        type=int,
        default=1,
        default=8,
        help="The batch size per GPU/TPU core/CPU for evaluation."
    )
    parser.add_argument(
@@ -166,6 +166,24 @@ def parse_args():
    return args


def get_logger(name, to_file, level=logging.INFO):
    """Create a logger that writes both to a file and to the console."""
    logger = logging.getLogger(name=name)
    logger.setLevel(level=level)
    formatter = logging.Formatter('%(asctime)s-%(name)s-%(levelname)s-%(message)s')

    handler = logging.FileHandler(filename=to_file)
    handler.setLevel(level=level)
    handler.setFormatter(formatter)

    console = logging.StreamHandler()
    console.setLevel(level=level)
    console.setFormatter(formatter)

    logger.addHandler(handler)
    logger.addHandler(console)
    return logger


class PairwiseDataset(Dataset):
    """
    Turn the dataset into pairs of input + chosen and input + rejected,
@@ -230,11 +248,11 @@ def __getitem__(self, idx):

def compute_metrics(eval_preds):
    preds = eval_preds[0]
    logger.info(f"Monitor: Shape of preds {preds.shape}")
    logger.info(f"DEBUG: Shape of preds {preds.shape}")
    # Reshape flat predictions into (chosen, rejected) score pairs.
    preds = np.reshape(preds, (-1, 2))
    logger.info(f"Monitor: chosen mean {np.mean(preds[:, 0])} vs rejected mean {np.mean(preds[:, 1])}")
    logger.info(f"DEBUG: chosen mean {np.mean(preds[:, 0])} vs rejected mean {np.mean(preds[:, 1])}")
    for idx in [0, 1, -2, -1]:
        logger.info(f"Monitor: chosen vs rejected, [{idx}] {preds[idx, 0]}: {preds[idx, 1]}")
        logger.info(f"DEBUG: chosen vs rejected, [{idx}] {preds[idx, 0]}: {preds[idx, 1]}")
    # Accuracy: fraction of pairs where the chosen response scores at least as high.
    acc = np.sum(preds[:, 0] >= preds[:, 1]) / preds.shape[0]
    return {"accuracy": acc}

@@ -311,7 +329,6 @@ def on_step_end(self, args, state, control, **kwargs):
    bf16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    remove_unused_columns=False,
    logging_first_step=True,
    label_names=["labels"],
@@ -343,7 +360,7 @@ def on_step_end(self, args, state, control, **kwargs):
        "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
    )

    tokenizer = AutoTokenizer.from_pretrained(args.init_model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.sft_model_name_or_path)
    tokenizer.truncation_side = "left"
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ["<|system|>", "<|assistant|>", "<|user|>", "<|im_end|>"]}
@@ -353,16 +370,17 @@
    logger.info(f"Set tokenizer.pad_token to: {tokenizer.pad_token}")
    logger.info(f"tokenizer: {tokenizer}")

    hf_config = AutoConfig.from_pretrained(args.init_model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(args.init_model_name_or_path, trust_remote_code=True)
    hf_config = AutoConfig.from_pretrained(args.sft_model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(args.sft_model_name_or_path, trust_remote_code=True)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.resize_token_embeddings(len(tokenizer))
    logger.info(f"Set model pad_token_id to: {model.model.config.pad_token_id}")
    model.config.max_position_embeddings = args.max_seq_len
    logger.info("Model:")
    logger.info(f"{model}")
    num_layers, num_layers_unfrozen = freeze_bottom_causal_layers(model, args.how_layers_unfrozen)

    logger.info(f"Model: {args.init_model_name_or_path}")
    logger.info(f"Model: {args.sft_model_name_or_path}")
    logger.info(f"Model num_layers: {num_layers}")
    logger.info(f"Model num_unfrozen: {num_layers_unfrozen}")

@@ -426,6 +444,7 @@ def on_step_end(self, args, state, control, **kwargs):
    for checkpoint_subdir in checkpoint_subdirs:
        logger.info(f"{checkpoint_subdir}")
        tokenizer.save_pretrained(checkpoint_subdir)
        # hf_config.save_pretrained(checkpoint_subdir)

    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)
@@ -448,4 +467,4 @@
    metrics["eval_samples"] = len(val_dataset)
    logger.info(f"Evaluate metrics: {metrics}")
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
@@ -0,0 +1,15 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: configs/deepspeed/ds_config_zero2.json
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false
@@ -0,0 +1,29 @@
{
    "bf16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu"
        },
        "offload_optimizer": {
            "device": "cpu"
        },
        "gather_16bit_weights_on_model_save": true,
        "round_robin_gradients": true,
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e8
    },
    "zero_allow_untested_optimizer": true,
    "zero_force_ds_cpu_optimizer": false,
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
4 changes: 1 addition & 3 deletions llm/alignment/summarize_rlhf/configs/ppo_tldr.yml
@@ -1,5 +1,5 @@
train:
  seq_length: 550
  seq_length: 2048
  epochs: 10000
  total_steps: 100000
  batch_size: 4 # train_micro_batch_size_per_gpu
@@ -10,8 +10,6 @@ train:
  pipeline: "PromptPipeline"
  trainer: "AcceleratePPOTrainer"
  tracker: "tensorboard"
  logging_dir: "./outputs/tldr"
  checkpoint_dir: "./outputs/tldr"

model:
  model_path: "openai_summarize_tldr_sft"
2 changes: 1 addition & 1 deletion llm/alignment/summarize_rlhf/trlx_tldr.py
@@ -39,7 +39,7 @@ def parse_args():
    parser.add_argument(
        "--trl_config_file",
        type=str,
        default="configs/ppo_tldr_6B.yml",
        default="configs/ppo_tldr.yml",
        help="The config file for trlx trainer.",
    )
    parser.add_argument(
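
For reference, a minimal sketch of how a config file like `configs/ppo_tldr.yml` is typically loaded with the CarperAI/trlx API (an assumption about usage, not the commit's exact code):
```python
from trlx.data.configs import TRLConfig

# Load the PPO hyperparameters (seq_length, model_path, trainer, tracker,
# etc.) from the YAML config shipped with this commit.
config = TRLConfig.load_yaml("configs/ppo_tldr.yml")
```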