Merge pull request #5 from csunny/fix
fix some issue
csunny authored Jul 9, 2023
2 parents 1f1b7e2 + fa92656 commit ab43955
Showing 6 changed files with 70 additions and 106 deletions.
11 changes: 8 additions & 3 deletions README.md
@@ -53,14 +53,19 @@ The approximate hardware resources required to quantize and fine-tune the model

#### Spider+QLoRA+Falcon

This experimental project builds the dataset by adding table structure information and adjusting the parameters of the language model, and then fine-tunes the base model with methods such as QLoRA, aiming to reduce the cost of fine-tuning while improving the accuracy and speed of SQL generation.
This experimental project builds a dataset by adding table structure information, adjusting the parameters of the language model and then fine-tuning the Falcon model with QLoRA, aiming to reduce the cost of fine-tuning while increasing the accuracy and speed of SQL generation. This can be executed with the following command:

```shell
sh ./scripts/spider_falcon_finetune.sh
```
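
For orientation, the following is a minimal, hedged sketch of what a 4-bit QLoRA setup typically looks like with transformers, bitsandbytes, and peft. The base model name (tiiuae/falcon-7b), the target modules, and the hyperparameters are illustrative assumptions, not the exact values used by train_qlora.py in this repository.

```python
# Illustrative QLoRA setup: a 4-bit-quantized base model plus trainable LoRA adapters.
# Model name, target modules, and hyperparameters are assumptions, not this repo's exact code.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # keep frozen base weights in 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # do the matmuls in bfloat16
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",                     # assumed base model
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)

model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["query_key_value"],     # Falcon's fused attention projection
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)  # only the small LoRA matrices receive gradients
```

The base model stays frozen in 4-bit precision and only the LoRA adapter matrices are trained, which is what keeps the hardware cost of fine-tuning low.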

## 3. Usage

### 3.1. Environment preparation

```
git clone https://github.com/csunny/DB-GPT-Hub.git
cd DB-GPT-Hub
pip install -r requirements.txt
conda create -n dbgpt_hub python=3.10
conda activate dbgpt_hub
@@ -140,7 +145,7 @@ SQL_PROMPT_DICT = {
Model fine-tuning uses the qlora method, where we can run the following command to fine-tune the model:

```bash
python train_qlora.py --model_name_or_path <path_or_name>
python src/train/train_qlora.py --model_name_or_path <path_or_name>
```

The fine-tuned model weights will be saved to the output folder by default
@@ -150,7 +155,7 @@ The fine-tuned model weights will be saved to the output folder by default
Run the following command to generate the final merged model:

```bash
python merge_peft_adapters.py --base_model_name_or_path <path_or_name>
python src/utils/merge_peft_adapters.py --base_model_name_or_path <path_or_name>
```
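
Conceptually, the merge step loads the base model, attaches the trained LoRA adapter, folds the adapter weights back into the base weights, and saves a standalone model. Below is a hedged sketch of that flow with peft; the base model name is an assumption, while the adapter and output paths mirror the defaults that merge_peft_adapters.py uses later in this diff.

```python
# Illustrative sketch of merging a LoRA adapter into its base model with peft.
# The base model name is an assumption; the paths match the script's new defaults.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
model = PeftModel.from_pretrained(base, "./adapter/checkpoint-10/adapter_model")
model = model.merge_and_unload()  # bake the LoRA deltas into the base weights

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
model.save_pretrained("./merged_models")
tokenizer.save_pretrained("./merged_models")
```

After merging, the output directory can be loaded like any ordinary Hugging Face checkpoint, with no peft dependency at inference time.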

## 4. The development path
11 changes: 8 additions & 3 deletions README.zh.md
@@ -53,14 +53,19 @@ The base models currently supported by DB-GPT-HUB are:

#### Spider+QLoRA+Falcon

This experimental project builds the dataset by adding table structure information and adjusting the parameters of the language model, then fine-tunes the base model with methods such as QLoRA, aiming to reduce the cost of fine-tuning while improving the accuracy and speed of SQL generation.
This experimental project builds the dataset by adding table structure information and adjusting the parameters of the language model, then fine-tunes the Falcon model with QLoRA, aiming to reduce the cost of fine-tuning while improving the accuracy and speed of SQL generation. It can be run with the following command:

```shell
sh ./scripts/spider_falcon_finetune.sh
```

## 3. Usage

### 3.1. Environment preparation

```
git clone https://github.com/csunny/DB-GPT-Hub.git
cd DB-GPT-Hub
pip install -r requirements.txt
conda create -n dbgpt_hub python=3.10
conda activate dbgpt_hub
@@ -139,7 +144,7 @@ SQL_PROMPT_DICT = {
Model fine-tuning uses the qlora method; we can run the following command to fine-tune the model:

```bash
python train_qlora.py --model_name_or_path <path_or_name>
python src/train/train_qlora.py --model_name_or_path <path_or_name>
```

The fine-tuned model weights are saved to the output folder by default.
@@ -149,7 +154,7 @@ python train_qlora.py --model_name_or_path <path_or_name>
Run the following command to generate the final merged model:

```bash
python merge_peft_adapters.py --base_model_name_or_path <path_or_name>
python src/utils/merge_peft_adapters.py --base_model_name_or_path <path_or_name>
```

## 4. Development roadmap
89 changes: 11 additions & 78 deletions requirements.txt
@@ -1,80 +1,13 @@
torch==2.0.0
accelerate==0.16.0
aiohttp==3.8.4
aiosignal==1.3.1
async-timeout==4.0.2
attrs==22.2.0
transformers @ git+https://github.com/huggingface/transformers.git
peft @ git+https://github.com/huggingface/peft.git
accelerate @ git+https://github.com/huggingface/[email protected]
bitsandbytes==0.39.0
cchardet==2.1.7
chardet==5.1.0
contourpy==1.0.7
cycler==0.11.0
filelock==3.9.0
fonttools==4.38.0
frozenlist==1.3.3
huggingface-hub==0.14.1
importlib-resources==5.12.0
einops==0.6.1
evaluate==0.4.0
scikit-learn==1.2.2
sentencepiece==0.1.99
wandb==0.15.3
rapidfuzz
scipy
datasets

sqlparse==0.4.4
kiwisolver==1.4.4
matplotlib==3.7.1
multidict==6.0.4
packaging==23.0
psutil==5.9.4
pycocotools==2.0.6
pyparsing==3.0.9
python-dateutil==2.8.2
pyyaml==6.0
tokenizers==0.13.2
tqdm==4.64.1
transformers==4.30.0
timm==0.6.13
spacy==3.5.3
webdataset==0.2.48
yarl==1.8.2
zipp==3.14.0
omegaconf==2.3.0
opencv-python==4.7.0.72
iopath==0.1.10
tenacity==8.2.2
peft
pycocoevalcap
cpm_kernels
umap-learn
notebook
gradio==3.23
gradio-client==0.0.8
wandb
llama-index==0.5.27
pymysql
unstructured==0.6.3
grpcio==1.47.5
gpt4all==0.3.0
diskcache==5.6.1

auto-gpt-plugin-template
pymdown-extensions
gTTS==2.3.1
langchain
nltk
python-dotenv==1.0.0
pymilvus==2.2.1
vcrpy
chromadb==0.3.22
markdown2
colorama
playsound
distro
pypdf
weaviate-client

# Testing dependencies
pytest
asynctest
pytest-asyncio
pytest-benchmark
pytest-cov
pytest-integration
pytest-mock
pytest-recording
pytesseract==0.3.10
28 changes: 28 additions & 0 deletions scripts/spider_falcon_finetune.sh
@@ -0,0 +1,28 @@
#!/bin/bash

python src/sql_data_process.py

python src/train/train_qlora.py \
--output_dir ./adapter \
--dataset spider \
--do_train True \
--do_eval False \
--do_merge True \
--source_max_len 384 \
--target_max_len 128 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--logging_steps 2 \
--max_steps 10 \
--save_strategy steps \
--data_seed 42 \
--save_steps 1000 \
--save_total_limit 40 \
--evaluation_strategy steps \
--eval_dataset_size 1024 \
--max_eval_samples 1000 \
--eval_steps 10 \
--optim paged_adamw_32bit \

python src/utils/merge_peft_adapters.py
25 changes: 10 additions & 15 deletions src/train/train_qlora.py
@@ -11,7 +11,7 @@
import logging
import bitsandbytes as bnb
import pandas as pd

import argparse
import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
@@ -36,25 +36,18 @@
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_PATH)
from pilot.configs.config import Config
from pilot.configs.model_config import LLM_MODEL_CONFIG

torch.backends.cuda.matmul.allow_tf32 = True

logger = logging.getLogger(__name__)

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

CFG = Config()
model_path = LLM_MODEL_CONFIG[CFG.LLM_MODEL]
model_path = os.path.join("./model", os.listdir("model")[1])

@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(
default=model_path
default=model_path
)
trust_remote_code: Optional[bool] = field(
default=True,
@@ -170,7 +163,7 @@ class TrainingArguments(transformers.Seq2SeqTrainingArguments):
default='none',
metadata={"help": "To use wandb or something else for reporting."}
)
output_dir: str = field(default='./train/output', metadata={"help": 'The output dir for logs and checkpoints'})
output_dir: str = field(default='./adapter', metadata={"help": 'The output dir for logs and checkpoints'})
optim: str = field(default='paged_adamw_32bit', metadata={"help": 'The optimizer to be used'})
per_device_train_batch_size: int = field(default=1, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
gradient_accumulation_steps: int = field(default=16, metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'})
@@ -374,7 +367,7 @@ def smart_tokenizer_and_embedding_resize(
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
@@ -553,7 +546,7 @@ def load_data(dataset_name):
elif dataset_name == 'vicuna':
raise NotImplementedError("Vicuna data was not released.")
elif dataset_name == 'spider':
return load_dataset("json", data_files="train/sql_fintune_data.json")
return load_dataset("json", data_files="sql_fintune_data.json")
else:
if os.path.exists(dataset_name):
try:
@@ -833,6 +826,8 @@ def on_evaluate(self, args, state, control, model, **kwargs):
if (args.do_train or args.do_eval or args.do_predict):
with open(os.path.join(args.output_dir, "metrics.json"), "w") as fout:
fout.write(json.dumps(all_metrics))

if __name__ == "__main__":
train()
train()


12 changes: 5 additions & 7 deletions src/utils/merge_peft_adapters.py
@@ -7,16 +7,14 @@

ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_PATH)
from pilot.configs.config import Config
from pilot.configs.model_config import LLM_MODEL_CONFIG
CFG = Config()
model_path = LLM_MODEL_CONFIG[CFG.LLM_MODEL]

model_path = os.path.join("./model", os.listdir("model")[1])

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_model_name_or_path", type=str, default=model_path)
parser.add_argument("--peft_model_path", type=str, default="train/output/checkpoint-10/adapter_model")
parser.add_argument("--output_dir", type=str, default="train/output/merged_models/")
parser.add_argument("--peft_model_path", type=str, default="./adapter/checkpoint-10/adapter_model")
parser.add_argument("--output_dir", type=str, default="./merged_models")
parser.add_argument("--device", type=str, default="cpu")

return parser.parse_args()
@@ -45,4 +43,4 @@ def main():
print(f"Model saved to {args.output_dir}")

if __name__ == "__main__" :
main()
main()
