diff --git a/.gitignore b/.gitignore
index 1f27fad..9ad84a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,9 @@ instance/
 # Scrapy stuff:
 .scrapy
+test.py
+tests/
+aresdocs/
 
 # Sphinx documentation
 docs/_build/
 
diff --git a/ares/LLM_as_a_Judge_Adaptation/Generate_Synthetic_Queries_and_Answers.py b/ares/LLM_as_a_Judge_Adaptation/Generate_Synthetic_Queries_and_Answers.py
index f879356..c070ada 100644
--- a/ares/LLM_as_a_Judge_Adaptation/Generate_Synthetic_Queries_and_Answers.py
+++ b/ares/LLM_as_a_Judge_Adaptation/Generate_Synthetic_Queries_and_Answers.py
@@ -492,6 +492,94 @@ def save_synthetic_queries(documents: pd.DataFrame, filename: str) -> None:
     documents.to_csv(filename, index=False, sep="\t")
     print("Saved synthetic queries to: " + filename)
 
+def query_decomposition_post_processing(synthetic_queries_filename: str):
+    """
+    Decompose the synthetic queries stored in the given file into simple sub-queries
+    and overwrite the file with the decomposed queries.
+
+    Parameters:
+    - synthetic_queries_filename (str): Path to the TSV file containing the synthetic queries.
+    """
+    # Read the synthetic queries from the specified file
+    synth_queries = pd.read_csv(synthetic_queries_filename, sep="\t")
+
+    # Drop any duplicated columns
+    synth_queries = synth_queries.loc[:, ~synth_queries.columns.duplicated()]
+
+    model_name = "google/flan-t5-xl"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+    decomposed_data = []
+    for _, row in synth_queries.iterrows():
+        # Decompose the complex query
+        simple_queries = decompose_query_with_model(row["Queries"], tokenizer, model)
+
+        # Add a new row for each decomposed query
+        for simple_query in simple_queries:
+            new_row = row.copy()
+            new_row["Queries"] = simple_query
+            decomposed_data.append(new_row)
+
+    # Replace the original queries with the decomposed queries
+    synth_queries = pd.DataFrame(decomposed_data)
+
+    # Save the new synthetic queries to the file
+    save_synthetic_queries(synth_queries, synthetic_queries_filename)
+
+def decompose_query_with_model(query: str, tokenizer, model):
+    """
+    Decompose a given query into sub-queries using a lightweight model.
+
+    Parameters:
+    - query (str): The query to be decomposed.
+    - tokenizer (AutoTokenizer): Tokenizer for the model.
+    - model (AutoModelForSeq2SeqLM): Language model used to perform the query decomposition.
+
+    Returns:
+    - list: A list of the simple queries contained in the original query.
+    """
+
+    input_text = f"""
+    You are an expert at decomposing questions. At the end of this prompt I have provided you a query.
+    This query could be decomposed into simple queries. It can be decomposed if the original query has multiple questions.
+    If there are not multiple questions then return the original query. If the original query has multiple questions
+    return the multiple questions in the format below. Be very cautious to not repeat any queries, this is very important.
+    There may be no simple queries and there may be many simple queries.
+
+    The output should be all the questions split by commas.
+    There should be no other information. Do not have double quotes either.
+
+    The following are examples of a complex query being decomposed into simple queries.
+
+    Examples:
+
+    Decompose: "What were Einstein’s key theories and how did they influence nuclear technology?"
+    - "What were Einstein’s key theories?"
+    - "How did Einstein's key theories influence nuclear technology?"
+
+    Decompose: "Explain the concept of quantum entanglement and its potential applications."
+    - "Explain the concept of quantum entanglement."
+    - "What are the potential applications of quantum entanglement?"
+
+    Decompose: "Describe the process of photosynthesis and its importance to the ecosystem."
+    - "What is the process of photosynthesis?"
+ - "Why is photosynthesis important to the ecosystem?" + + Decompose: "How did the industrial revolution shape modern economies, influence technology, and society?" + - "How did the industrial revolution shape modern economies?" + - "How did the industrial revolution influence technology?" + - "How did the industrial revolution influence society?" + + Decompose: "What is the first letter of the alphabet?" + - "What is the first letter of the alphabet?" + + Decompose: "How many cows are in America?" + "How many cows are in America?" + + Here is the query to decompose: {query}""" + inputs = tokenizer(input_text, return_tensors="pt", truncation=True) + + # Generate the model output + outputs = model.generate(**inputs, max_length=128, num_return_sequences=1) + + # Decode the generated text into a list of simple queries + simple_queries = [tokenizer.decode(output, skip_special_tokens=True).replace(' - ', ',').split(',') for output in outputs] + return simple_queries[0] + def generate_synthetic_queries(documents: pd.DataFrame, settings: dict) -> pd.DataFrame: """ Generate synthetic queries using the FLAN approach. diff --git a/ares/LLM_as_a_Judge_Adaptation/LLM_Synthetic_Generation.py b/ares/LLM_as_a_Judge_Adaptation/LLM_Synthetic_Generation.py index d66ebcb..a1fcd15 100644 --- a/ares/LLM_as_a_Judge_Adaptation/LLM_Synthetic_Generation.py +++ b/ares/LLM_as_a_Judge_Adaptation/LLM_Synthetic_Generation.py @@ -261,6 +261,25 @@ def generate_synthetic_query_vllm_approach(document: str, synthetic_query_prompt def generate_synthetic_answer_api_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str, length_of_fewshot_prompt: int, model_name: str, for_fever_dataset=False, for_wow_dataset=False): + """ + Generates synthetic answers using a model's API based on the provided document and question. + + This function constructs an answer dynamically using the api and model provided. + + Args: + document (str): The document text based on which the contradictory answer is to be generated. + question (str): The question text based on the document. + synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended. + fewshot_examples (str): Few-shot examples to include in the prompt for the API. + api_url (str): The API endpoint URL. + api_key (str): The API key for authentication. + model_name (str): The model name to be used in the API. + for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False. + for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False. + + Returns: + str: The generated answer text. + """ # Construct the prompt without the document based on the dataset type prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n" if for_fever_dataset: @@ -317,12 +336,31 @@ def generate_synthetic_answer_api_approach(document: str, question: str, synthet return final_response except Exception as e: - print(f"Error generating synthetic queries: {e}") + print(f"Error generating synthetic answers: {e}") continue def generate_synthetic_answer_azure_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str, length_of_fewshot_prompt: int, azure_openai_config: dict, for_fever_dataset=False, for_wow_dataset=False): + """ + Generates synthetic answers using provided Azure OpenAI model based on the provided document and question. 
+
+    This function constructs the answer dynamically using the provided Azure OpenAI deployment.
+
+    Args:
+        document (str): The document text on which the generated answer should be based.
+        question (str): The question to be answered using the document.
+        synthetic_answer_prompt (str): The system prompt instructing the model how to generate answers.
+        prompt (str): The few-shot examples to which the document and question will be appended.
+        length_of_fewshot_prompt (int): The number of few-shot examples contained in the prompt.
+        azure_openai_config (dict): Configuration for the Azure OpenAI deployment to be used.
+        for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
+        for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.
+
+    Returns:
+        str: The generated answer text.
+    """
     # Construct the prompt without the document based on the dataset type
     prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
     if for_fever_dataset:
@@ -382,12 +420,31 @@ def generate_synthetic_answer_azure_approach(document: str, question: str, synth
             return final_response
 
         except Exception as e:
-            print(f"Error generating synthetic queries: {e}")
+            print(f"Error generating synthetic answers: {e}")
             continue
 
 def generate_synthetic_answer_vllm_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
                                             length_of_fewshot_prompt: int, model_name: str, host_url: str, for_fever_dataset=False,
-                                            for_wow_dataset=False):
+                                            for_wow_dataset=False):
+    """
+    Generates synthetic answers using a vLLM-served model based on the provided document and question.
+
+    This function constructs the answer dynamically using the vLLM instance available at the given host URL.
+
+    Args:
+        document (str): The document text on which the generated answer should be based.
+        question (str): The question to be answered using the document.
+        synthetic_answer_prompt (str): The system prompt instructing the model how to generate answers.
+        prompt (str): The few-shot examples to which the document and question will be appended.
+        length_of_fewshot_prompt (int): The number of few-shot examples contained in the prompt.
+        model_name (str): The name of the model served by vLLM.
+        host_url (str): The URL of the vLLM host serving the model.
+        for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
+        for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.
+
+    Returns:
+        str: The generated answer text.
+ """ # Construct the prompt without the document based on the dataset type prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n" if for_fever_dataset: @@ -446,7 +503,7 @@ def generate_synthetic_answer_vllm_approach(document: str, question: str, synthe return final_response except Exception as e: - print(f"Error generating synthetic queries: {e}") + print(f"Error generating synthetic answers: {e}") continue def generate_synthetic_contradictory_answers_api_approach(document: str, question: str, synthetic_contradictory_answer_prompt: str, fewshot_examples: str, diff --git a/ares/RAG_Automatic_Evaluation/LLMJudge_RAG_Compared_Scoring.py b/ares/RAG_Automatic_Evaluation/LLMJudge_RAG_Compared_Scoring.py index a44f21c..4359906 100644 --- a/ares/RAG_Automatic_Evaluation/LLMJudge_RAG_Compared_Scoring.py +++ b/ares/RAG_Automatic_Evaluation/LLMJudge_RAG_Compared_Scoring.py @@ -3,9 +3,10 @@ T5Tokenizer, T5EncoderModel, T5ForConditionalGeneration, BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer, TrainingArguments, get_scheduler, - AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification, - MptForSequenceClassification + AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification, + MptForSequenceClassification, AutoModelForSeq2SeqLM ) + import sys import pandas as pd import numpy as np @@ -449,7 +450,7 @@ def filter_dataset(rag_type: str = "question_answering") -> tuple[str, str, str] return context_relevance_system_prompt, answer_faithfulness_system_prompt, answer_relevance_system_prompt -def preprocess_data(test_set_selection: str, label_column: str, labels: list): +def preprocess_data(test_set_selection: str, label_column: str, labels: list, query_decomposition: bool): """ Preprocesses the data for evaluation. @@ -457,6 +458,7 @@ def preprocess_data(test_set_selection: str, label_column: str, labels: list): - test_set_selection (str): The file path to the test set selection in CSV format. - label_column (str): The column name in the test set that contains the labels. - labels (list): A list of labels to be used for filtering the test set. + - query_decomposition (bool): When processing the data do we want to decompose queries? Returns: - Tuple[pd.DataFrame, str]: A tuple containing the preprocessed test set DataFrame and the name of the text column. 
@@ -488,15 +490,95 @@ def preprocess_data(test_set_selection: str, label_column: str, labels: list):
     # Filter out rows where the text column has the value "Error"
     test_set = test_set[test_set[text_column] != "Error"]
 
+    # Check if the dataset has fewer than 10 rows after filtering
     if len(test_set) < 10:
         raise ValueError("Insufficient Data: Dataset has fewer than 10 rows after filtering!")
+
+    if query_decomposition:
+        model_name = "google/flan-t5-xl"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+        decomposed_data = []
+        for _, row in test_set.iterrows():
+            # Decompose the complex query
+            simple_queries = decompose_query_with_model(row["Queries"], tokenizer, model)
+
+            # Add a new row for each decomposed query
+            for simple_query in simple_queries:
+                new_row = row.copy()
+                new_row["Queries"] = simple_query
+                decomposed_data.append(new_row)
+
+        # Replace the test set with the decomposed queries
+        test_set = pd.DataFrame(decomposed_data)
 
     return test_set, text_column
 
 ############################################################
 
+def decompose_query_with_model(query, tokenizer, model):
+    """
+    Decompose a given query into sub-queries using a lightweight model.
+
+    Parameters:
+    - query (str): The query to be decomposed.
+    - tokenizer (AutoTokenizer): Tokenizer for the model.
+    - model (AutoModelForSeq2SeqLM): Language model used to perform the query decomposition.
+
+    Returns:
+    - list: A list of the simple queries contained in the original query.
+    """
+
+    input_text = f"""
+    You are an expert at decomposing questions. At the end of this prompt I have provided you a query.
+    This query could be decomposed into simple queries. It can be decomposed if the original query has multiple questions.
+    If there are not multiple questions then return the original query. If the original query has multiple questions
+    return the multiple questions in the format below. Be very cautious to not repeat any queries, this is very important.
+    There may be no simple queries and there may be many simple queries.
+
+    The output should be all the questions split by commas.
+    There should be no other information. Do not have double quotes either.
+
+    The following are examples of a complex query being decomposed into simple queries.
+
+    Examples:
+
+    Decompose: "What were Einstein’s key theories and how did they influence nuclear technology?"
+    - "What were Einstein’s key theories?"
+    - "How did Einstein's key theories influence nuclear technology?"
+
+    Decompose: "Explain the concept of quantum entanglement and its potential applications."
+    - "Explain the concept of quantum entanglement."
+    - "What are the potential applications of quantum entanglement?"
+
+    Decompose: "Describe the process of photosynthesis and its importance to the ecosystem."
+    - "What is the process of photosynthesis?"
+    - "Why is photosynthesis important to the ecosystem?"
+
+    Decompose: "How did the industrial revolution shape modern economies, influence technology, and society?"
+    - "How did the industrial revolution shape modern economies?"
+    - "How did the industrial revolution influence technology?"
+    - "How did the industrial revolution influence society?"
+
+    Decompose: "What is the first letter of the alphabet?"
+    - "What is the first letter of the alphabet?"
+
+    Decompose: "How many cows are in America?"
+    - "How many cows are in America?"
+
+    Here is the query to decompose: {query}"""
+    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
+
+    # Generate the model output
+    outputs = model.generate(**inputs, max_length=128, num_return_sequences=1)
+
+    # Decode the generated text into a list of simple queries
+    simple_queries = [tokenizer.decode(output, skip_special_tokens=True).replace(' - ', ',').split(',') for output in outputs]
+    return simple_queries[0]
+
 def togetherai_list_models(api_key: str) -> list:
     """
     Lists available models from the Together API.
diff --git a/ares/ares.py b/ares/ares.py
index 8cc4d23..6743e5d 100644
--- a/ares/ares.py
+++ b/ares/ares.py
@@ -34,10 +34,11 @@ class ARES:
         "request_delay": (int, 0),  # Optional with default
         "vllm": (bool, False),  # Optional with default
         "azure_openai_config": (dict, None),  # Optional with default
-        "host_url": (str, "None")  # Optional with default
+        "host_url": (str, "None"),  # Optional with default
     },
 
     "synthetic_query_generator": {
+        "query_decomposition": (bool, False),  # Optional with default
         "document_filepaths": (list, None),  # Required parameter with no default value
         "few_shot_prompt_filenames": (list, None),  # Required parameter with no default value
         "synthetic_queries_filenames": (list, None),  # Required parameter with no default value
@@ -60,7 +61,7 @@
         "synthetic_query_prompt": (str, "You are an expert question-answering system. Generate only one question based on the provided document. Ensure the question is answerable within the context of the document. Do not generate multiple questions. Do not provide labels, headers, or additional text. Only return a single, clear question. Generating more than one question will be considered incorrect output.\n\n"),
         "synthetic_valid_answer_prompt": (str, "You are an expert question-answering system. You must create an answer for the provided question. The answer must be answerable within the context of the document. Return only the answer, nothing else.\n\n"),
         "synthetic_contradictory_answer_prompt": (str, "Create an answer for the given question that contradicts the provided document. You should create false information that disagrees with what exists within the content of the document. Return only the false answer, without any labels or additional text.\n\n"),
-        "azure_openai_config": (dict, None)  # Optional with default
+        "azure_openai_config": (dict, None),  # Optional with default
     },
 
     "classifier_model": {
@@ -79,7 +80,7 @@
         "number_of_runs": (int, 1),  # Optional with default
         "num_warmup_steps": (int, 100),  # Optional with default
         "training_row_limit": (int, -1),  # Optional with default
-        "validation_row_limit": (int, -1)  # Optional with default
+        "validation_row_limit": (int, -1),  # Optional with default
     },
 
     "ppi": {
@@ -103,7 +104,8 @@
         "machine_label_llm_model": (str, "None"),  # Optional with default
         "gold_machine_label_path": (str, "None"),  # Optional with default
         "prediction_filepaths": (list, ["None"]),  # Optional with default
-        "azure_openai_config": (dict, None)  # Optional with default
+        "azure_openai_config": (dict, None),  # Optional with default
+        "query_decomposition": (bool, False),  # Optional with default
     }
 
     # azure_openai_config dictionary is formatted with 3 entries:
diff --git a/ares/rag_scoring.py b/ares/rag_scoring.py
index 5617e9c..7b64c86 100644
--- a/ares/rag_scoring.py
+++ b/ares/rag_scoring.py
@@ -18,7 +18,7 @@
 def rag_scoring_config(alpha, num_trials, evaluation_datasets, few_shot_examples_filepath, checkpoints, labels,
                        model_choice, llm_judge, assigned_batch_size, number_of_labels, gold_label_paths, rag_type,
                        vllm, host_url, request_delay, debug_mode,
-                       machine_label_llm_model, gold_machine_label_path, prediction_filepaths, azure_openai_config):
+                       machine_label_llm_model, gold_machine_label_path, prediction_filepaths, azure_openai_config, query_decomposition):
     """
     Configures and runs the RAG scoring process.
 
@@ -43,6 +43,7 @@ def rag_scoring_config(alpha, num_trials, evaluation_datasets, few_shot_examples
     - gold_machine_label_path: Path to the gold machine labels.
     - prediction_filepaths: List of file paths to save predictions.
     - azure_openai_config: Dictionary of information to setup Azure model
+    - query_decomposition: Whether query decomposition is applied to the test set before evaluation
     """
 
     if few_shot_examples_filepath == "None" and (llm_judge != "None" or machine_label_llm_model != "None"):
@@ -91,7 +92,7 @@ def rag_scoring_config(alpha, num_trials, evaluation_datasets, few_shot_examples
     context_relevance_system_prompt, answer_faithfulness_system_prompt, answer_relevance_system_prompt = filter_dataset(rag_type)
 
-    test_set, text_column = preprocess_data(test_set_selection, label_column, labels)
+    test_set, text_column = preprocess_data(test_set_selection, label_column, labels, query_decomposition)
 
     loaded_model = model_loader(checkpoint)
 
     if isinstance(loaded_model, tuple):
diff --git a/ares/synthetic_generator.py b/ares/synthetic_generator.py
index ca383c2..c1cc0a9 100644
--- a/ares/synthetic_generator.py
+++ b/ares/synthetic_generator.py
@@ -5,12 +5,14 @@ #
     generate_contradictory_answers,
     generate_few_shot_prompts,
     generate_synthetic_queries,
+    query_decomposition_post_processing,
     Generate_Synthetic_Answers
 )
 import os
 
 def synthetic_generator_config(
+    query_decomposition: bool,
     document_filepaths: list,
     few_shot_prompt_filenames: list,
     synthetic_queries_filenames: list,
@@ -42,12 +44,13 @@ def synthetic_generator_config(
         "Create an answer for the given question that contradicts the provided document. "
" "You should create false information that disagrees with what exists within the content of the document.\n\n" ), - azure_openai_config: dict = None + azure_openai_config: dict = None, ) -> None: """ Configures and generates synthetic queries and answers based on the provided parameters. Args: + query_decomposition (bool): Will synthetically generated queries be decomposed. document_filepaths (list): List of file paths to the documents. few_shot_prompt_filenames (list): List of filenames for the few-shot prompts. synthetic_queries_filenames (list): List of filenames for the synthetic queries. @@ -133,6 +136,9 @@ def synthetic_generator_config( generate_synthetic_queries(documents, synthetic_queries_config) + if query_decomposition: + query_decomposition_post_processing(synthetic_queries_filename) + synthetic_answers_config = { 'regenerate_answers': regenerate_answers, 'answer_gen_few_shot_examples': answer_gen_few_shot_examples,