From 6189cf54b35f9c3b12018c9f036272887a08631b Mon Sep 17 00:00:00 2001
From: wongjingping
Date: Tue, 17 Oct 2023 05:43:36 +0000
Subject: [PATCH] pass args to run_hf_eval (standardize)

- add -d as a boolean flag to opt in to private data
- fix openai runner to be able to use private data
- fix hf runner to use an adapter
---
 .gitignore                 |  1 +
 README.md                  |  2 +-
 eval/hf_runner.py          | 76 ++++++++++++++++++++++++--------------
 eval/openai_runner.py      | 18 +--------
 main.py                    | 16 ++------
 query_generators/openai.py |  6 ++-
 utils/pruning.py           |  2 +
 utils/questions.py         |  3 +-
 8 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/.gitignore b/.gitignore
index 28e9c0f..a95a631 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 data/postgres
 data/embeddings.pkl
+results
 
 # pycache
 **/__pycache__/
diff --git a/README.md b/README.md
index 55e5721..cf542c3 100644
--- a/README.md
+++ b/README.md
@@ -144,7 +144,7 @@ You can use the following flags in the command line to change the configurations
 | -g, --model_type | Model type used. Make sure this matches the model used. Currently defined options in `main.py` are `oa` for OpenAI models and `hf` for Hugging Face models. |
 | -m, --model | Model that will be tested and used to generate the queries. Currently defined options for OpenAI models are chat models `gpt-3.5-turbo-0613` and `gpt-4-0613`, and non-chat model `text-davinci-003`. For Hugging Face models, simply use the path of your chosen model (e.g. `defog/sqlcoder`). |
 | -f, --prompt_file | Markdown file with the prompt used for query generation. |
-| -d, --use_defog_data | Use this to toggle between using the public data or your own private data. |
+| -d, --use_private_data | Use this flag to read from your own private data library instead of the public data. |
 | -o, --output_file | Output CSV file that will store your results. |
 | -p, --parallel_threads | The default no. of parallel threads is 5. Decrease this to 1 for gpt-4 to avoid the rate limit error. Parallelization support is currently only defined for OpenAI models. |
 | -t, --timeout_gen | No. of seconds before timeout occurs for query generation. The default is 30.0s. |
diff --git a/eval/hf_runner.py b/eval/hf_runner.py
index b04d6ba..6530623 100644
--- a/eval/hf_runner.py
+++ b/eval/hf_runner.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from eval.eval import compare_query_results
 import pandas as pd
 import torch
@@ -28,29 +29,28 @@ def generate_prompt(prompt_file, question, db_name, public_data):
     return prompt
 
 
-def get_tokenizer_model(model_name):
-    if "llama" not in model_name:
-        # tokenizer = AutoTokenizer.from_pretrained(model_name)
-        # model = AutoModelForCausalLM.from_pretrained(
-        #     model_name,
-        #     trust_remote_code=True,
-        #     torch_dtype=torch.float16,
-        #     device_map="auto",
-        #     use_cache=True
-        # )
-        model_path = "/home/defog/finetuning/starcoder/sqlcoder_npl_cfc_map_600"
-        config = PeftConfig.from_pretrained(model_path)
+def get_tokenizer_model(model_name: Optional[str], adapter_path: Optional[str]):
+    """
+    Load a Hugging Face tokenizer and model.
+    You may supply either a regular Hugging Face model name or a PEFT adapter path.
+    """
+    if adapter_path is not None:
+        print(f"Loading adapter model {adapter_path}")
+        config = PeftConfig.from_pretrained(adapter_path)
         tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
         model = AutoModelForCausalLM.from_pretrained(
             config.base_model_name_or_path,
-            use_auth_token=True,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch.float16,
             trust_remote_code=True,
+            use_cache=True,
             device_map="auto",
         )
-        model = PeftModel.from_pretrained(model, model_path)
+        print(f"Loading adapter {adapter_path}")
+        model = PeftModel.from_pretrained(model, adapter_path)
         model = model.merge_and_unload()
-    else:
+        print(f"Merged adapter {adapter_path}")
+    elif model_name is not None and "llama" in model_name:
+        print(f"Loading Llama-based model {model_name}")
         tokenizer = LlamaTokenizer.from_pretrained(
             model_name, legacy=False, use_fast=True
         )
@@ -61,19 +61,36 @@ def get_tokenizer_model(model_name):
             use_cache=True,
             use_flash_attention_2=True,
         )
+    else:
+        print(f"Loading model {model_name}")
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            device_map="auto",
+        )
     return tokenizer, model
 
 
-def run_hf_eval(
-    questions_file: str,
-    prompt_file: str,
-    num_questions: int = None,
-    public_data: bool = True,
-    model_name: str = "defog/starcoder-finetune-v3",
-    output_file: str = "results.csv",
-):
+def run_hf_eval(args):
+    # get params from args
+    questions_file = args.questions_file
+    prompt_file = args.prompt_file
+    num_questions = args.num_questions
+    public_data = not args.use_private_data
+    model_name = args.model
+    adapter_path = args.adapter
+    output_file = args.output_file
+
+    if model_name is None and adapter_path is None:
+        raise ValueError(
+            "You must supply either a model name or an adapter path to run an evaluation."
+        )
+
     print("preparing questions...")
     # get questions
+    print(f"Using {num_questions} questions from {questions_file}")
     df = prepare_questions_df(questions_file, num_questions)
 
     # create a prompt for each question
@@ -86,7 +103,7 @@ def run_hf_eval(
     print("questions prepared\nnow loading model...")
 
     # initialize tokenizer and model
-    tokenizer, model = get_tokenizer_model(model_name)
+    tokenizer, model = get_tokenizer_model(model_name, adapter_path)
     model.tie_weights()
 
     print("model loaded\nnow generating and evaluating predictions...")
@@ -99,11 +116,11 @@ def run_hf_eval(
     total_correct = 0
     output_rows = []
 
-    if "llama" not in model_name.lower():
+    if model_name is None or "llama" not in model_name.lower():
         pipeline_config = {
             "max_new_tokens": 300,
             "do_sample": False,
-            "num_beams": 5,
+            "num_beams": 4,
         }
     else:
         pipeline_config = {
@@ -115,12 +132,15 @@ def run_hf_eval(
     with tqdm(total=len(df)) as pbar:
         for row in df.to_dict("records"):
             total_tried += 1
+            gc.collect()
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
             start_time = time()
             generated_query = (
                 pipe(
                     row["prompt"],
                     num_return_sequences=1,
-                    # eos_token_id=eos_token_id,
+                    eos_token_id=tokenizer.eos_token_id,
                     pad_token_id=tokenizer.eos_token_id,
                     **pipeline_config,
                 )[0]["generated_text"]
diff --git a/eval/openai_runner.py b/eval/openai_runner.py
index 388a4db..47aac1b 100644
--- a/eval/openai_runner.py
+++ b/eval/openai_runner.py
@@ -12,21 +12,6 @@ def run_openai_eval(args):
     print("preparing questions...")
     # get questions
     question_query_df = prepare_questions_df(args.questions_file, args.num_questions)
-    qg_class = OpenAIQueryGenerator
-    # add columns for generated query and metrics
-    question_query_df["generated_query"] = ""
-    question_query_df["reason"] = ""
-    question_query_df["error_msg"] = ""
-    question_query_df["exact_match"] = 0
-    question_query_df["correct"] = 0
-    question_query_df["error_query_gen"] = 0
-    question_query_df["error_db_exec"] = 0
-    question_query_df["timeout"] = 0
-    # add custom metrics below:
-    question_query_df["latency_seconds"] = 0.0  # latency of query generation in seconds
-    question_query_df["tokens_used"] = 0  # number of tokens used in query generation
-
-    question_query_df.reset_index(inplace=True, drop=True)
     input_rows = question_query_df.to_dict("records")
     output_rows = []
 
@@ -44,11 +29,12 @@ def run_openai_eval(args):
             "database": db_name,
         }
 
-        qg = qg_class(
+        qg = OpenAIQueryGenerator(
             db_creds=copy.deepcopy(db_creds),
             model=args.model,
             prompt_file=args.prompt_file,
             timeout=args.timeout_gen,
+            use_public_data=not args.use_private_data,
             verbose=args.verbose,
         )
 
diff --git a/main.py b/main.py
index c179a6e..4238171 100644
--- a/main.py
+++ b/main.py
@@ -9,8 +9,9 @@
     parser.add_argument("-n", "--num_questions", type=int, default=None)
     parser.add_argument("-g", "--model_type", type=str, required=True)
     parser.add_argument("-m", "--model", type=str)
+    parser.add_argument("-a", "--adapter", type=str)
     parser.add_argument("-f", "--prompt_file", type=str, required=True)
-    parser.add_argument("-d", "--use_defog_data", type=bool, default=True)
+    parser.add_argument("-d", "--use_private_data", action="store_true")
    parser.add_argument("-o", "--output_file", type=str, required=True)
     parser.add_argument("-p", "--parallel_threads", type=int, default=5)
     parser.add_argument("-t", "--timeout_gen", type=float, default=30.0)
@@ -28,18 +29,7 @@
         args.model = "claude-2"
         run_anthropic_eval(args)
     elif args.model_type == "hf":
-        if args.model is None:
-            raise ValueError(
-                "Model must be specified for HF model type. See section on CLI flags in README.md for more details."
-            )
-        run_hf_eval(
-            questions_file=args.questions_file,
-            prompt_file=args.prompt_file,
-            num_questions=args.num_questions,
-            public_data=args.use_defog_data,
-            model_name=args.model,
-            output_file=args.output_file,
-        )
+        run_hf_eval(args)
     else:
         raise ValueError(
             f"Invalid model type: {args.model_type}. Model type must be one of: 'oa', 'hf'"
diff --git a/query_generators/openai.py b/query_generators/openai.py
index 3b9e49c..1fd24e9 100644
--- a/query_generators/openai.py
+++ b/query_generators/openai.py
@@ -20,6 +20,7 @@ def __init__(
         model: str,
         prompt_file: str,
         timeout: int,
+        use_public_data: bool,
         verbose: bool,
         **kwargs,
     ):
@@ -27,6 +28,7 @@ def __init__(
         self.db_name = db_creds["database"]
         self.model = model
         self.prompt_file = prompt_file
+        self.use_public_data = use_public_data
         self.timeout = timeout
         self.verbose = verbose
 
@@ -147,7 +149,9 @@ def generate_query(self, question: str) -> dict:
 
         user_prompt = user_prompt.format(
             user_question=question,
-            table_metadata_string=prune_metadata_str(question, self.db_name),
+            table_metadata_string=prune_metadata_str(
+                question, self.db_name, self.use_public_data
+            ),
         )
 
         messages = []
diff --git a/utils/pruning.py b/utils/pruning.py
index 8976997..33acdbd 100644
--- a/utils/pruning.py
+++ b/utils/pruning.py
@@ -167,8 +167,10 @@ def prune_metadata_str(question, db_name, public_data=True):
     root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
     emb_path = os.path.join(root_dir, "data", "embeddings.pkl")
     if public_data:
+        print("Loading public data")
         import defog_data.supplementary as sup
     else:
+        print("Loading private data")
         import defog_data_private.supplementary as sup
     emb, csv_descriptions = sup.load_embeddings(emb_path)
     table_metadata_csv = get_md_emb(
diff --git a/utils/questions.py b/utils/questions.py
index 61fdef0..5f6b151 100644
--- a/utils/questions.py
+++ b/utils/questions.py
@@ -1,7 +1,8 @@
+from typing import Optional
 import pandas as pd
 
 
-def prepare_questions_df(questions_file, num_questions):
+def prepare_questions_df(questions_file: str, num_questions: Optional[int] = None):
     question_query_df = pd.read_csv(questions_file, nrows=num_questions)
     question_query_df["generated_query"] = ""
     question_query_df["reason"] = ""
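
Usage note (illustrative sketch, not part of the patch itself): after this change, run_hf_eval reads its settings from the argparse namespace built in main.py instead of taking individual keyword arguments. The snippet below shows the fields it expects; the file paths and the adapter location are placeholders, and only the flags visible in this diff are assumed.

# Sketch: driving the refactored HF runner directly with an argparse-style namespace.
# Rough CLI equivalent: python main.py -g hf -a <adapter_path> -f prompt.md -o results/my_run.csv -d
# (plus the questions-file argument defined earlier in main.py, which is not shown in this diff).
from argparse import Namespace

from eval.hf_runner import run_hf_eval

args = Namespace(
    questions_file="questions.csv",    # placeholder path to the evaluation questions CSV
    prompt_file="prompt.md",           # -f: prompt markdown file (placeholder)
    num_questions=None,                # -n: None evaluates all questions
    use_private_data=True,             # -d: run_hf_eval sets public_data = not use_private_data
    model=None,                        # -m: optional when an adapter is supplied
    adapter="path/to/adapter",         # -a: PEFT adapter path, merged into its base model
    output_file="results/my_run.csv",  # -o: output CSV that will store the results
)
run_hf_eval(args)  # raises ValueError if both model and adapter are None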