diff --git a/graph_rag/graph_builder/Example/build_with_relic.MD b/graph_rag/graph_builder/Example/build_with_relic.MD
new file mode 100644
index 0000000..1b473ab
--- /dev/null
+++ b/graph_rag/graph_builder/Example/build_with_relic.MD
@@ -0,0 +1,143 @@
+# Knowledge Graph with Relik and Llama-Index
+
+This markdown file demonstrates an experiment in building a knowledge graph with `Relik` and `Llama-Index` Property Graphs. The steps include coreference resolution with `spaCy`, relation extraction with `Relik`, and knowledge graph construction with `llama-index` Property Graphs, stored in `neo4j`.
+
+## Import Necessary Libraries
+
+Import the essential libraries required for the experiment. These include NLP tools (`spaCy`, `coreferee`), document readers, large language models (LLMs), embeddings, and Neo4j for graph storage.
+
+```python
+import spacy, coreferee
+from llama_index.core import SimpleDirectoryReader
+import nest_asyncio
+from llama_index.llms.ollama import Ollama
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core import PropertyGraphIndex
+from llama_index.core import Settings
+from llama_index.extractors.relik.base import RelikPathExtractor
+from llama_index.graph_stores.neo4j import Neo4jPGStore
+
+nest_asyncio.apply()  # allow nested event loops when running inside a notebook
+```
+
+## Coreference Resolution Function
+
+Sets up a function to resolve coreferences in a text. This is crucial for ensuring that references to entities like "she" or "it" are correctly linked back to their antecedents, which reduces duplication of nodes in the knowledge graph.
+
+```python
+coref_nlp = spacy.load('en_core_web_lg')
+coref_nlp.add_pipe('coreferee')
+
+def coref_text(text):
+    coref_doc = coref_nlp(text)
+    resolved_text = ""
+
+    for token in coref_doc:
+        repres = coref_doc._.coref_chains.resolve(token)
+        if repres:
+            resolved_text += " " + " and ".join(
+                [
+                    t.text
+                    if t.ent_type_ == ""
+                    else [e.text for e in coref_doc.ents if t in e][0]
+                    for t in repres
+                ]
+            )
+        else:
+            resolved_text += " " + token.text
+
+    return resolved_text
+```
+
+### Example Usage of Coreference Resolution
+
+An example is provided to demonstrate how the `coref_text` function resolves references in the text.
+
+```python
+coref_text("alice is great. she can study for long hours and remember")
+# Output: alice is great. alice can study for long hours and remember
+```
+
+## Load and Process Documents
+
+The documents are loaded from a specified directory and processed with the coreference resolution function to prepare them for knowledge graph construction.
+
+```python
+documents = SimpleDirectoryReader(input_dir='/content/data').load_data()
+len(documents)
+
+for doc in documents:
+    doc.text = coref_text(doc.text)
+```
+
+## Initialize Relik Path Extractor
+
+Here, the `RelikPathExtractor` is initialized; it will be used to extract relationships between entities from the processed documents.
+
+```python
+relik = RelikPathExtractor(
+    model="relik-ie/relik-relation-extraction-small", model_config={"skip_metadata": True}
+)
+```
+
+## Set Up Language Model and Embeddings
+
+This section configures the LLM (`Ollama`) and the embedding model (`HuggingFaceEmbedding`) used for generating embeddings for the knowledge graph.
+
+```python
+llm = Ollama(base_url="http://localhost:11434", model="llama3.1")
+embed_model = HuggingFaceEmbedding(model_name="microsoft/codebert-base")
+Settings.llm = llm
+```
+
+## Configure Neo4j Graph Store
+
+Sets up the connection to a Neo4j database, where the knowledge graph will be stored. Be sure to replace the placeholder password with your actual Neo4j password.
+
+```python
+username = "neo4j"
+password = "*****************************"
+url = "neo4j+s://45256b03.databases.neo4j.io"
+
+graph_store = Neo4jPGStore(
+    username=username,
+    password=password,
+    url=url,
+    refresh_schema=False
+)
+```
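+
+If you prefer not to hard-code credentials, they can also be read from environment variables. This is an illustrative sketch, not part of the original example; the variable names `NEO4J_USERNAME`, `NEO4J_PASSWORD`, and `NEO4J_URI` are assumptions.
+
+```python
+import os
+
+username = os.environ.get("NEO4J_USERNAME", "neo4j")
+password = os.environ["NEO4J_PASSWORD"]  # assumed variable name
+url = os.environ["NEO4J_URI"]            # assumed variable name
+
+graph_store = Neo4jPGStore(username=username, password=password, url=url, refresh_schema=False)
+```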
+
+## Build the Knowledge Graph
+
+Here, the knowledge graph is constructed from the processed documents using the configured tools: `Relik`, `Ollama`, `HuggingFaceEmbedding`, and `Neo4j`.
+
+```python
+index = PropertyGraphIndex.from_documents(
+    documents,
+    kg_extractors=[relik],
+    llm=llm,
+    embed_model=embed_model,
+    property_graph_store=graph_store,
+    show_progress=True,
+)
+```
+![Knowledge graph visualisation](random/visualisation.png)
+
+## Query the Knowledge Graph
+
+Finally, a query engine is created, allowing you to query the knowledge graph. Example queries and their expected outputs are provided.
+
+```python
+query_engine = index.as_query_engine(include_text=True)
+
+response = query_engine.query("what is keras nlp?")
+print(str(response))
+
+# Output: Keras NLP provides a simple way to fine-tune pre-trained language models for various natural language processing tasks...
+```
+
+```python
+response = query_engine.query("format for citing keras nlp")
+print(str(response))
+
+# Output: To cite Keras NLP, you can refer to the following format: KerasNLP. (n.d.). Retrieved from ...
+```
diff --git a/graph_rag/graph_builder/Example/random/visualisation.png b/graph_rag/graph_builder/Example/random/visualisation.png
new file mode 100644
index 0000000..6562782
Binary files /dev/null and b/graph_rag/graph_builder/Example/random/visualisation.png differ
diff --git a/graph_rag/graph_retrieval/README.MD b/graph_rag/graph_retrieval/README.MD
index c7333ee..92c74ab 100644
--- a/graph_rag/graph_retrieval/README.MD
+++ b/graph_rag/graph_retrieval/README.MD
@@ -41,4 +41,37 @@ from graph_rag.graph_retrieval.graph_retrieval import graph_query
 response = graph_query("Your query here", query_engine)
 print(response)
 ```
+## Advanced Training with QLoRA and P-Tuning
+
+> Fine-tuning LLMs on your data (masked-language or next-token prediction) for a few epochs may result in better retrieval and responses.
+
+### 1. Setup
+
+To use QLoRA and P-Tuning, ensure your environment is set up with the required libraries and that your model and dataset configurations are defined in a `config.yaml` file.
+
+### 2. Fine-Tuning with QLoRA
+
+Use the QLoRA method for efficient fine-tuning by passing the appropriate configurations in your `config.yaml`. This method is ideal when working with large models on limited hardware. Execute the training script with the `--config` argument to specify your configuration file:
+
+```bash
+python qlora_adapter.py --config path/to/config.yaml
+```
+
+### 3. Fine-Tuning with P-Tuning
+
+P-Tuning allows for parameter-efficient prompt-based fine-tuning. Adjust the number of virtual tokens and other related parameters in the `config.yaml` to customize the training process. Execute the training script with the `--config` argument to specify your configuration file:
+
+```bash
+python p_tuning.py --config path/to/config.yaml
+```
+
+This will start the training process using the specified method (QLoRA or P-Tuning) and configurations.
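+
+Both scripts read their model, data, and training settings from their respective `config.yaml` files (see `training_scripts/QLoRA_tuning/config.yaml` and `training_scripts/prompt_tuning/config.yaml`). As an illustration, the P-Tuning-related fields look like this (values taken from the sample config; adjust them for your own run):
+
+```yaml
+Training:
+  num_virtual_tokens: 4
+  num_epochs: 6
+  learning_rate: 0.0035
+  batch_size: 25
+```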
+
diff --git a/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/config.yaml b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/config.yaml
new file mode 100644
index 0000000..9654452
--- /dev/null
+++ b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/config.yaml
@@ -0,0 +1,35 @@
+MODEL:
+  MODEL: "codellama/CodeLlama-7b-Instruct-hf"
+  SEQ_LENGTH: 2048
+  LOAD_IN_8BIT: False
+
+DATA:
+  REPO_PATH: '/content/keras-io/templates'
+  SEED: 0
+  EXTENSIONS: [ 'md' ]
+  OUTPUT_FILE: 'merged_output.txt'  # file that will hold the merged repository content
+
+TRAINING_ARGUMENTS:
+  BATCH_SIZE: 64
+  GR_ACC_STEPS: 1
+  LR: 5e-4
+  LR_SCHEDULER_TYPE: "cosine"
+  WEIGHT_DECAY: 0.01
+  NUM_WARMUP_STEPS: 30
+  EVAL_FREQ: 100
+  SAVE_FREQ: 100
+  LOG_FREQ: 10
+  MASKED_LANGUAGE_MODELLING: False  # train with next-token prediction (causal LM) when False
+  OUTPUT_DIR:  # e.g. a run name; checkpoints are written under results/<OUTPUT_DIR>
+  BF16: True
+  FP16: False
+
+LORA:
+  LORA_R: 8
+  LORA_ALPHA: 32
+  LORA_DROPOUT: 0.0
+  LORA_TARGET_MODULES:  # optional; e.g. ['q_proj', 'v_proj'] for LLaMA-family models
+
+BNB_CONFIG:
+  USE_NESTED_QUANT: True
+  BNB_4BIT_COMPUTE_DTYPE: "bfloat16"
diff --git a/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
new file mode 100644
index 0000000..b4e30cf
--- /dev/null
+++ b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
@@ -0,0 +1,252 @@
+"""
+This script facilitates the fine-tuning of a language model using QLoRA (Quantized Low-Rank Adaptation)
+adapters.
+
+The main functionalities include:
+- Preparing data from a specified repository with specific file extensions.
+- Tokenizing the data for model training.
+- Loading and configuring a pre-trained language model.
+- Applying PEFT (Parameter-Efficient Fine-Tuning) using QLoRA.
+- Defining training arguments and creating a Trainer instance.
+- Executing the training process with the Trainer.
+
+Requirements:
+- A YAML configuration file that specifies model, training, and data parameters.
+"""
+
+import argparse
+import glob
+import os
+
+import torch
+import yaml
+from datasets import Dataset
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    DataCollatorForLanguageModeling,
+    Trainer,
+    TrainingArguments,
+)
+
+
+def prepare_data(repo_path: str, extensions: list, output_file: str):
+    """
+    Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
+
+    Args:
+        repo_path: Path to the repository to collect files from.
+        extensions: List of file extensions to include in the data preparation.
+        output_file: Path to the output file where the concatenated content will be saved.
+
+    Returns:
+        A string containing the entire content written to the output file.
+    """
+
+    files = []
+    for ext in extensions:
+        files.extend(
+            glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True)
+        )
+
+    with open(output_file, "w", encoding="utf-8") as outfile:
+        for path in files:
+            with open(path, "r", encoding="utf-8") as file:
+                content = file.read()
+            outfile.write(f"### {path} ###\n")
+            outfile.write(content)
+            outfile.write("\n\n")
+
+    with open(output_file, "r") as f:
+        return f.read()
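+
+
+# Illustrative usage of prepare_data (paths taken from the sample config.yaml; adjust as needed):
+#   content = prepare_data("/content/keras-io/templates", ["md"], "merged_output.txt")
+# `content` then holds the concatenated markdown sources that are tokenized below.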
+
+
+def data_for_training(content: str, config: dict):
+    """
+    Tokenizes the content and prepares it for language model training, including creating a data collator.
+
+    Args:
+        content: The concatenated text content to be tokenized.
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A tuple containing the tokenized dataset, the data collator, and the tokenizer.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(config["MODEL"]["MODEL"])
+    context_length = config["MODEL"]["SEQ_LENGTH"]
+    outputs = tokenizer(
+        content,
+        truncation=True,
+        max_length=context_length,
+        return_overflowing_tokens=True,
+        return_length=True,
+    )
+    print(f"Input IDs length: {len(outputs['input_ids'])}")
+    print(f"Input chunk lengths: {outputs['length']}")
+    print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
+    ds = Dataset.from_dict(outputs)
+    ds_removed = ds.remove_columns(
+        ["attention_mask", "length", "overflow_to_sample_mapping"]
+    )
+    tokenizer.pad_token = tokenizer.eos_token
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer, mlm=config["TRAINING_ARGUMENTS"]["MASKED_LANGUAGE_MODELLING"]
+    )
+    return ds_removed, data_collator, tokenizer
+
+
+def load_base_model(config: dict):
+    """
+    Loads the base language model with specified configurations, including quantization settings.
+
+    Args:
+        config: The configuration dictionary containing model and BNB (BitsAndBytes) parameters.
+
+    Returns:
+        PreTrainedModel: The loaded pre-trained language model ready for training.
+    """
+
+    compute_dtype = getattr(torch, config["BNB_CONFIG"]["BNB_4BIT_COMPUTE_DTYPE"])
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=compute_dtype,
+        bnb_4bit_use_double_quant=config["BNB_CONFIG"]["USE_NESTED_QUANT"],
+    )
+    device_map = {"": 0}
+
+    model = AutoModelForCausalLM.from_pretrained(
+        config["MODEL"]["MODEL"],
+        load_in_8bit=config["MODEL"]["LOAD_IN_8BIT"],
+        quantization_config=bnb_config,
+        device_map=device_map,
+        use_cache=False,
+        trust_remote_code=True,
+    )
+    return model
+
+
+def load_peft_model(model: object, config: dict):
+    """
+    Applies PEFT (Parameter-Efficient Fine-Tuning) using QLoRA to the given model.
+
+    Args:
+        model: The pre-trained language model to be fine-tuned.
+        config: The configuration dictionary containing LoRA (Low-Rank Adaptation) parameters.
+
+    Returns:
+        PreTrainedModel: The PEFT-configured model ready for training.
+    """
+
+    model = prepare_model_for_kbit_training(model)
+    peft_config = LoraConfig(
+        lora_alpha=config["LORA"]["LORA_ALPHA"],
+        lora_dropout=config["LORA"]["LORA_DROPOUT"],
+        r=config["LORA"]["LORA_R"],
+        bias="none",
+        task_type="CAUSAL_LM",
+        # target_modules=config["LORA"]["LORA_TARGET_MODULES"],
+    )
+    model = get_peft_model(model, peft_config)
+    model.print_trainable_parameters()
+    return model
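+
+
+# Note on target_modules (left commented out above): when it is not set, PEFT falls back to its
+# built-in defaults for known architectures (for LLaMA-family models these are the attention
+# projections, e.g. "q_proj" and "v_proj"). If you want to adapt more layers, a common choice
+# for LLaMA-style models is ["q_proj", "k_proj", "v_proj", "o_proj"]; treat these names as a
+# suggestion to verify against your model's module names, not as part of the original script.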
+ """ + + training_args = TrainingArguments( + output_dir=f"results/{config['TRAINING_ARGUMENTS']['OUTPUT_DIR']}", + num_train_epochs=3, + dataloader_drop_last=True, + evaluation_strategy="steps", + save_strategy="steps", + eval_steps=config["TRAINING_ARGUMENTS"]["EVAL_FREQ"], + save_steps=config["TRAINING_ARGUMENTS"]["SAVE_FREQ"], + logging_steps=config["TRAINING_ARGUMENTS"]["LOG_FREQ"], + per_device_train_batch_size=64, + per_device_eval_batch_size=64, + learning_rate=config["TRAINING_ARGUMENTS"]["LR"], + lr_scheduler_type=config["TRAINING_ARGUMENTS"]["LR_SCHEDULER_TYPE"], + warmup_steps=config["TRAINING_ARGUMENTS"]["NUM_WARMUP_STEPS"], + gradient_accumulation_steps=config["TRAINING_ARGUMENTS"]["GR_ACC_STEPS"], + gradient_checkpointing=True, + fp16=config["TRAINING_ARGUMENTS"]["FP16"], + bf16=config["TRAINING_ARGUMENTS"]["BF16"], + weight_decay=config["TRAINING_ARGUMENTS"]["WEIGHT_DECAY"], + # push_to_hub=True, + include_tokens_per_second=True, + ) + return training_args + + +def create_trainer( + tokenizer: object, train_data: object, data_collator: object, model: object +): + """ + Creates a Trainer instance with the provided tokenizer, training data, data collator, and model. + + Args: + tokenizer: The tokenizer to be used during training. + train_data : The tokenized training dataset. + data_collator: The data collator for language modeling. + model : The pre-trained and fine-tuned model. + + Returns: + Trainer: The Trainer instance for model training. + """ + training_args = create_training_arguments() + trainer = Trainer( + model=model, + tokenizer=tokenizer, + args=training_args, + data_collator=data_collator, + train_dataset=train_data, + eval_dataset=train_data, + ) + return trainer + + +def main(): + """ + The main function that orchestrates the data preparation, model loading, + and training processes using the provided YAML configuration. 
+ """ + + parser = argparse.ArgumentParser( + description="Training script for QLoRA adapter tuning" + ) + parser.add_argument( + "--config", type=str, required=True, help="Path to the YAML configuration file" + ) + args = parser.parse_args() + + with open(args.config, "r") as file: + config = yaml.safe_load(file) + + content = prepare_data( + config["Data"]["repo_path"], + config["Data"]["extensions"], + config["Data"]["output_file"], + ) + + train_data, data_collator, tokenizer = data_for_training(content, config) + model = load_base_model(config) + model = load_peft_model(model, config) + trainer = create_trainer(config, tokenizer, train_data, data_collator, model) + + trainer.train() + + +if __name__ == "__main__": + main() diff --git a/graph_rag/graph_retrieval/training_scripts/prompt_tuning/config.yaml b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/config.yaml new file mode 100644 index 0000000..df23185 --- /dev/null +++ b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/config.yaml @@ -0,0 +1,20 @@ +Data: + repo_path : '/content/keras-io/templates' + extensions : ['md'] + output_file : 'merged_output.txt' +ModeL: + model: 'bigcode/starcoderbase-1b' + context_length: 128 +Training: + masked_language_modelling: False + num_virtual_tokens : 4 + num_epochs : 6 + learning_rate: 0.0035 + output_dir: "/" + max_steps: 4 + batch_size: 25 + auto_batch_size : False + push_to_hub: False + + + diff --git a/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py new file mode 100644 index 0000000..f24d954 --- /dev/null +++ b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py @@ -0,0 +1,196 @@ +""" +This script prepares data from a repository for training a P-tuning model using the PEFT library. +It reads source files, processes them into tokenized chunks, and trains a language model using the specified configuration. + +Functions: +- prepare_data: Collects files from a repository, concatenates their content, and saves it to an output file. +- data_for_training: Tokenizes the concatenated content and prepares it for language model training. +- get_peft_model: Initializes and configures a P-tuning model using the specified configuration. +- create_training_arguments: Generates training arguments for the Trainer using the configuration settings. +- create_trainer: Creates a Trainer object with the model, data, and training arguments. +- main: Parses the YAML configuration file and runs the training process. + +Requirements: +- A YAML configuration file that specifies model, training, and data parameters. +""" + +import argparse +import yaml +import os +import glob +from datasets import Dataset +from transformers import Trainer, DataCollatorForLanguageModeling +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit +from transformers import TrainingArguments + + +def prepare_data(repo_path: str, extensions: list, output_file: str): + """ + Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file. + + Args: + repo_path: Path to the repository to collect files from. + extensions: List of file extensions to include in the data preparation. + output_file: Path to the output file where the concatenated content will be saved. + + Returns: + A string containing the entire content written to the output file. 
+ """ + + files = [] + for ext in extensions: + files.extend( + glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True) + ) + + with open(output_file, "w", encoding="utf-8") as outfile: + for path in files: + with open(path, "r", encoding="utf-8") as file: + content = file.read() + outfile.write(f"### {path} ###\n") + outfile.write(content) + outfile.write("\n\n") + + with open(output_file, "r") as f: + return f.read() + + +def data_for_training(content: str, config: dict): + """ + Tokenizes the content and prepares it for language model training, including creating a data collator. + + Args: + content: The concatenated text content to be tokenized. + config: Dictionary containing the model and training configuration. + + Returns: + A tuple containing the tokenized dataset and the data collator for language model training. + """ + + tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"]) + context_length = config["Model"]["context_length"] + outputs = tokenizer( + content, + truncation=True, + max_length=context_length, + return_overflowing_tokens=True, + return_length=True, + ) + print(f"Input IDs length: {len(outputs['input_ids'])}") + print(f"Input chunk lengths: {outputs['length']}") + print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}") + ds = Dataset.from_dict(outputs) + ds_removed = ds.remove_columns( + ["attention_mask", "length", "overflow_to_sample_mapping"] + ) + tokenizer.pad_token = tokenizer.eos_token + data_collator = DataCollatorForLanguageModeling( + tokenizer, mlm=config["Training"]["masked_language_modelling"] + ) + return ds_removed, data_collator + + +def get_peft_model(config: dict): + """ + Initializes and configures a P-tuning model using the specified foundational model and prompt tuning configuration. + + Args: + config: Dictionary containing the model and training configuration. + + Returns: + A P-tuned model ready for training. + """ + + foundational_model = AutoModelForCausalLM.from_pretrained( + config["Model"]["model"], trust_remote_code=True + ) + generation_config = PromptTuningConfig( + task_type=TaskType.CAUSAL_LM, + prompt_tuning_init=PromptTuningInit.RANDOM, + num_virtual_tokens=config["Training"]["num_virtual_tokens"], + tokenizer_name_or_path=config["Model"]["model"], + ) + peft_model_prompt = get_peft_model(foundational_model, generation_config) + peft_model_prompt.print_trainable_parameters() + return peft_model_prompt + + +def create_training_arguments(config: dict): + """ + Creates and configures the training arguments for the Trainer object. + + Args: + config: Dictionary containing the training configuration. + + Returns: + A TrainingArguments object with the specified settings. + """ + + training_args = TrainingArguments( + output_dir=config["Training"]["output_dir"], + save_strategy="steps", + per_device_train_batch_size=config["Training"]["batch_size"], + auto_find_batch_size=config["Training"]["auto_batch_size"], + learning_rate=config["Training"]["learning_rate"], + num_train_epochs=config["Training"]["num_epochs"], + push_to_hub=config["Training"]["push_to_hub"], + ) + return training_args + + +def create_trainer( + config: dict, train_data: object, data_collator: object, model: object +): + """ + Creates a Trainer object for training the model with the provided data and configuration. + + Args: + config: Dictionary containing the training configuration. + train_data: The tokenized dataset to be used for training hf Dataset object. 
+ data_collator: The data collator for handling the tokenized data during training. + model: The P-tuned model to be trained. + + Returns: + A Trainer object configured for training the model. + """ + + training_args = create_training_arguments(config) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_data, + data_collator=data_collator, + ) + return trainer + + +def main(): + """ + Main function to execute the training pipeline. It parses the YAML configuration file, prepares the data, initializes + the model, and starts the training process. + """ + parser = argparse.ArgumentParser(description="Training script for P-tuning model") + parser.add_argument( + "--config", type=str, required=True, help="Path to the YAML configuration file" + ) + args = parser.parse_args() + + with open(args.config, "r") as file: + config = yaml.safe_load(file) + + content = prepare_data( + config["Data"]["repo_path"], + config["Data"]["extensions"], + config["Data"]["output_file"], + ) + + train_data, data_collator = data_for_training(content, config) + model = get_peft_model(config) + trainer = create_trainer(config, train_data, data_collator, model) + + trainer.train() + + +if __name__ == "__main__": + main()