diff --git a/docs/source/setup/quickstart.rst b/docs/source/setup/quickstart.rst
index ef6c548..8ea88dc 100644
--- a/docs/source/setup/quickstart.rst
+++ b/docs/source/setup/quickstart.rst
@@ -26,8 +26,30 @@ For additional details, refer to the `installation guide list[str]:
             for i, result in enumerate(results)
         ]
     )
-
-        # logging.info(
-        #     f"{i}. \033[94mURL: {result.url}\033[0m (Score: \033[95m{result.score:.2f}\033[0m)"
-        # )
-        # logging.info("-" * 50)
-        # logging.info(f"Title: \033[93m{result.title}\033[0m")
-        # logging.info(f"Text:\n{result.text}\n")
-        # # logging.info(f"Metadata:\n{result.metadata}...")
-        # logging.info("-" * 80)
diff --git a/synthesizer/interface/rag/google_search.py b/synthesizer/interface/rag/google_search.py
new file mode 100644
index 0000000..c803a39
--- /dev/null
+++ b/synthesizer/interface/rag/google_search.py
@@ -0,0 +1,438 @@
+import os
+from dataclasses import dataclass
+from typing import Optional
+
+from agent_search.core import SERPClient
+
+from synthesizer.core import RAGProviderName
+from synthesizer.interface.base import RAGInterface, RAGProviderConfig
+from synthesizer.interface.rag_interface_manager import (
+    rag_config,
+    rag_provider,
+)
+
+
+def call_search_engine(query, serpapi_api_key):
+    from serpapi import GoogleSearch
+
+    params = {
+        "q": query,
+        # "location": "California, United States",
+        "hl": "en",
+        "gl": "us",
+        "google_domain": "google.com",
+        "api_key": serpapi_api_key,
+    }
+
+    search = GoogleSearch(params)
+    return search.get_dict()
+
+
+@dataclass
+@rag_config
+class GoogleSearchRAGConfig(RAGProviderConfig):
+    """Configuration for the Google Search RAG provider."""
+
+    provider_name: RAGProviderName = RAGProviderName.GOOGLE_SEARCH
+    google_domain: str = "google.com"
+    api_key: Optional[str] = None
+
+
+# Source - https://github.com/freshllms/freshqa/blob/main/freshprompt.ipynb
+
+
+def is_date(string, fuzzy=False):
+    """Check if the given string is a date."""
+    import dateutil.parser
+
+    # Parse a string into a date and check its validity
+    try:
+        dateutil.parser.parse(string, fuzzy=fuzzy)
+        return True
+    except ValueError:
+        return False
+
+
+def extract_source_webpage(link):
+    # Extract source webpage
+    return (
+        link.strip()
+        .replace("https://www.", "")
+        .replace("http://www.", "")
+        .replace("https://", "")
+        .replace("http://", "")
+        .split("/")[0]
+    )
+
+
+def simplify_displayed_link(displayed_link):
+    # Simplify displayed link
+    if displayed_link is None:
+        return None
+    return extract_source_webpage(displayed_link.split(" › ")[0])
+
+
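+# Illustrative sketch only: the helpers below normalize raw SerpApi-style
+# entries into the fixed evidence record used throughout this module
+# ({"source", "date", "title", "snippet", "highlight"}). Only keys that this
+# module reads are shown, and the values are invented for clarity.
+#
+#   raw = {
+#       "title": "Quantum field theory in curved spacetime - Wikipedia",
+#       "displayed_link": "https://en.wikipedia.org › wiki › Quantum_field_theory",
+#       "date": "Mar 1, 2023",
+#       "snippet": "Quantum field theory in curved spacetime is ...",
+#       "snippet_highlighted_words": ["curved spacetime"],
+#   }
+#   format_search_results(raw)
+#   # -> {"source": "en.wikipedia.org", "date": "Mar 1, 2023",
+#   #     "title": "Quantum field theory in curved spacetime - Wikipedia",
+#   #     "snippet": "Quantum field theory in curved spacetime is ...",
+#   #     "highlight": "curved spacetime"}
+
+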
+def format_search_results(search_data, title_field=None, highlight_field=None):
+    # Standardize search results as shown in Figure 3 (left) in the paper
+    field = "snippet_highlighted_words"
+    if field in search_data and isinstance(search_data[field], list):
+        search_data[field] = " | ".join(search_data[field])
+
+    field = "displayed_link"
+    if field in search_data:
+        search_data[field] = simplify_displayed_link(search_data[field])
+
+    # edge case 1
+    if search_data.get("type") == "local_time":
+        source = search_data.get("displayed_link")
+        date = search_data.get("date")
+        title = search_data.get("title")
+
+        snippet = search_data.get("snippet")
+        if snippet is None and "result" in search_data:
+            if "extensions" in search_data and isinstance(
+                search_data["extensions"], list
+            ):
+                snippet = "\n\t".join(
+                    [search_data["result"]] + search_data["extensions"]
+                )
+            else:
+                snippet = search_data["result"]
+
+        highlight = search_data.get("snippet_highlighted_words")
+        if highlight is None and "result" in search_data:
+            highlight = search_data["result"]
+
+    # edge case 2
+    elif "type" in search_data and search_data["type"] == "population_result":
+        source = search_data.get("displayed_link")
+        if source is None and "sources" in search_data:
+            if (
+                isinstance(search_data["sources"], list)
+                and "link" in search_data["sources"][0]
+            ):
+                source = extract_source_webpage(
+                    search_data["sources"][0]["link"]
+                )
+
+        date = search_data.get("date")
+        if date is None and "year" in search_data:
+            date = search_data["year"]
+
+        title = search_data.get("title")
+
+        snippet = search_data.get("snippet")
+        if snippet is None and "population" in search_data:
+            if "place" in search_data:
+                snippet = "\n\t".join(
+                    [
+                        f"{search_data['place']} / Population",
+                    ]
+                    + [
+                        search_data["population"],
+                    ]
+                )
+            else:
+                snippet = search_data["population"]
+
+        highlight = search_data.get("snippet_highlighted_words")
+        if highlight is None and "population" in search_data:
+            highlight = search_data["population"]
+
+    else:
+        source = search_data.get("displayed_link")
+        date = search_data.get("date")
+
+        title = (
+            search_data.get("title")
+            if title_field is None
+            else search_data.get(title_field)
+        )
+        highlight = (
+            search_data.get("snippet_highlighted_words")
+            if highlight_field is None
+            else search_data.get(highlight_field)
+        )
+        snippet = search_data.get("snippet", "")
+
+        if "rich_snippet" in search_data:
+            for key in ["top", "bottom"]:
+                if (
+                    key in search_data["rich_snippet"]
+                    and "extensions" in search_data["rich_snippet"][key]
+                ):
+                    snippet = "\n\t".join(
+                        [snippet]
+                        + search_data["rich_snippet"][key]["extensions"]
+                    )
+
+        if "list" in search_data:
+            assert isinstance(search_data["list"], list)
+            snippet = "\n\t".join([snippet] + search_data["list"])
+
+        if "contents" in search_data and "table" in search_data["contents"]:
+            tbl = search_data["contents"]["table"]
+            assert isinstance(tbl, list)
+            snippet += "\n"
+            for row in tbl:
+                snippet += f'\n{",".join(row)}'
+
+    if snippet is not None and snippet.strip() == "":
+        snippet = None
+
+    return {
+        "source": source,
+        "date": date,
+        "title": title,
+        "snippet": snippet,
+        "highlight": highlight,
+    }
+
+
+def format_questions_and_answers(search_data):
+    # Standardize questions and answers as shown in Figure 3 (left) in the paper
+    source = None
+    if "link" in search_data:
+        source = extract_source_webpage(search_data["link"])
+
+    date = None
+
+    title = None
+    if "question" in search_data:
+        title = search_data["question"]
+
+    snippet = None
+    if "answer" in search_data:
+        snippet = search_data["answer"]
+
+    highlight = None
+
+    return {
+        "source": source,
+        "date": date,
+        "title": title,
+        "snippet": snippet,
+        "highlight": highlight,
+    }
+
+
+def format_knowledge_graph(search_data):
+    # Standardize knowledge graphs as shown in Figure 3 (left) in the paper
+    source = None
+    if "source" in search_data and "link" in search_data["source"]:
+        source = extract_source_webpage(search_data["source"]["link"])
+
+    date = None
+
+    title = None
+    if "title" in search_data:
+        title = search_data["title"]
+        if "type" in search_data:
+            title += f"\n\t{search_data['type']}"
+
+    snippet = ""
+    for field in search_data:
+        if (
+            (field not in ["title", "type", "kgmid"])
+            and ("_link" not in field)
+            and ("_stick" not in field)
+            and isinstance(search_data[field], str)
+            and not search_data[field].startswith("http")
+        ):
+            snippet += f"\n\t{field}: {search_data[field]}"
+
+    if snippet.strip() == "":
+        snippet = None
+    else:
+        snippet = snippet.strip()
+
+    highlight = None
+
+    return {
+        "source": source,
+        "date": date,
+        "title": title,
+        "snippet": snippet,
+        "highlight": highlight,
+    }
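+# For instance (hypothetical payload, values invented), a knowledge-graph entry
+#   {"title": "Andrew Wiles", "type": "Mathematician", "born": "April 11, 1953"}
+# would come back roughly as
+#   {"source": None, "date": None, "title": "Andrew Wiles\n\tMathematician",
+#    "snippet": "born: April 11, 1953", "highlight": None}.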
+
+
+def freshprompt_format(
+    question,
+    search_data,
+    reasoning_and_answer,
+    num_organic_results,
+    num_related_questions,
+    num_questions_and_answers,
+    num_retrieved_evidences,
+):
+    """Build FreshPrompt for each question.
+
+    Args:
+        question: The question to process.
+        search_data: Search data.
+        reasoning_and_answer: The reasoning and answer.
+        num_organic_results: Number of organic results to keep.
+        num_related_questions: Number of related questions to keep.
+        num_questions_and_answers: Number of questions and answers to keep.
+        num_retrieved_evidences: Number of retrieved evidences to keep.
+
+    Returns:
+        A prompt that incorporates retrieved evidences for each question.
+    """
+    try:
+        import pandas as pd
+    except ImportError as e:
+        raise ImportError(
+            f"Error {e}: Please run `pip install pandas` before attempting to use Google Search RAGInterface."
+        )
+
+    df = pd.DataFrame(
+        columns=["source", "date", "title", "snippet", "highlight"]
+    )
+
+    # Organic results
+    organic_results = [None] * num_organic_results
+    for k in range(num_organic_results):
+        if (
+            "organic_results" in search_data
+            and len(search_data["organic_results"]) > k
+        ):
+            organic_results[k] = format_search_results(
+                search_data["organic_results"][k]
+            )
+        else:
+            organic_results[k] = format_search_results({})
+
+    for d in organic_results[::-1]:
+        df = pd.concat([df, pd.DataFrame([d])], ignore_index=True)
+
+    # Related questions
+    related_questions = [None] * num_related_questions
+    for k in range(num_related_questions):
+        if (
+            "related_questions" in search_data
+            and len(search_data["related_questions"]) > k
+        ):
+            related_questions[k] = format_search_results(
+                search_data["related_questions"][k], title_field="question"
+            )
+        else:
+            related_questions[k] = format_search_results({})
+
+    for d in related_questions[::-1]:
+        df = pd.concat([df, pd.DataFrame([d])], ignore_index=True)
+
+    # Questions and Answers
+    questions_and_answers = [None] * num_questions_and_answers
+    for k in range(num_questions_and_answers):
+        if (
+            "questions_and_answers" in search_data
+            and len(search_data["questions_and_answers"]) > k
+        ):
+            questions_and_answers[k] = format_questions_and_answers(
+                search_data["questions_and_answers"][k]
+            )
+        else:
+            questions_and_answers[k] = format_questions_and_answers({})
+
+    for d in questions_and_answers[::-1]:
+        df = pd.concat([df, pd.DataFrame([d])], ignore_index=True)
+
+    # Knowledge graph
+    knowledge_graph = None
+    if "knowledge_graph" in search_data:
+        knowledge_graph = format_knowledge_graph(
+            search_data["knowledge_graph"]
+        )
+    else:
+        knowledge_graph = format_knowledge_graph({})
+    df = pd.concat([df, pd.DataFrame([knowledge_graph])], ignore_index=True)
+
+    # Answer box
+    answer_box = None
+    if "answer_box" in search_data:
+        answer_box = format_search_results(
+            search_data["answer_box"], highlight_field="answer"
+        )
+    else:
+        answer_box = format_search_results({})
+    df = pd.concat([df, pd.DataFrame([answer_box])], ignore_index=True)
+
+    # Sort by date
+    # df["date"] = df["date"].apply(lambda x: format_date(x))
+    # df["datetime"] = pd.to_datetime(df["date"], errors="coerce")
+    # df = df.sort_values(by="datetime", na_position="first")
+    df.replace({pd.NaT: None}, inplace=True)
+    df = df.dropna(how="all")
+
+    # Select top_k supporting evidences overall
+    evidences = []
+
+    for _, row in df.tail(num_retrieved_evidences).iterrows():
+        evidences.append(
+            f"""\n\nsource: {row['source']}\ndate: {row['date']}\ntitle: {row['title']}\nsnippet: {row['snippet']}\nhighlight: {row['highlight']}"""
+        )
+
+    return (
+        "".join(
+            [
+                f"\n\n\nquery: {question}",
+            ]
+            + evidences
+        )
+        + f"\n\nquestion: {question}{reasoning_and_answer}"
+    )
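+# Roughly, the prompt string assembled above looks like the sketch below
+# (values illustrative). Each evidence block is one DataFrame row, in insertion
+# order (reversed organic results, then related questions, Q&As, the knowledge
+# graph, and the answer box last), and `reasoning_and_answer` is appended
+# verbatim after the final question:
+#
+#   query: <question>
+#
+#   source: en.wikipedia.org
+#   date: Mar 1, 2023
+#   title: ...
+#   snippet: ...
+#   highlight: ...
+#
+#   ... (one block per retrieved evidence) ...
+#
+#   question: <question>
+#    answer: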
+
+
+@rag_provider
+class GoogleSearchRAGInterface(RAGInterface):
+    """A RAG provider that uses Google Search (via SerpApi) as the retrieval source."""
+
+    provider_name = RAGProviderName.GOOGLE_SEARCH
+    FORMAT_INDENT = "    "
+
+    def __init__(
+        self,
+        config: GoogleSearchRAGConfig = GoogleSearchRAGConfig(),
+        *args,
+        **kwargs,
+    ) -> None:
+        try:
+            import serpapi
+        except ImportError as e:
+            raise ImportError(
+                f"ImportError: {e}. Note, SERP API must be installed to run RAG with Google Search."
+            )
+        try:
+            import dateutil
+        except ImportError as e:
+            raise ImportError(
+                f"ImportError: {e}. Note, `python-dateutil` must be installed to run RAG with Google Search."
+            )
+        super().__init__(config)
+        self.config: GoogleSearchRAGConfig = config
+
+    def get_rag_context(self, query) -> str:
+        """Get the context for a prompt."""
+        api_key = self.config.api_key or os.getenv("SERP_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "No API key provided. Please provide an API key or set the SERP_API_KEY environment variable."
+            )
+
+        num_organic_results = 15
+        num_related_questions = 3
+        num_questions_and_answers = 3
+        num_retrieved_evidences = 15
+
+        return freshprompt_format(
+            query,
+            call_search_engine(query, api_key),
+            "\n answer:",
+            num_organic_results,
+            num_related_questions,
+            num_questions_and_answers,
+            num_retrieved_evidences,
+        )
+
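+
+# Example usage (a sketch, not prescriptive): assumes a valid SerpApi key is
+# supplied via the config or the SERP_API_KEY environment variable, and that
+# network access to SerpApi is available.
+#
+#   interface = GoogleSearchRAGInterface(GoogleSearchRAGConfig())
+#   context = interface.get_rag_context(
+#       "What is a quantum field theory in curved space time?"
+#   )
+#   print(context)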
diff --git a/synthesizer/scripts/run_rag.py b/synthesizer/scripts/run_rag.py
new file mode 100644
index 0000000..861c113
--- /dev/null
+++ b/synthesizer/scripts/run_rag.py
@@ -0,0 +1,96 @@
+from synthesizer.core import LLMProviderName, RAGProviderName
+from synthesizer.interface import (
+    LLMInterfaceManager,
+    RAGInterfaceManager,
+)
+from synthesizer.llm import GenerationConfig
+import fire
+
+# Note, it is understood that the prompt has an issue with redundancy. This will be addressed shortly.
+PROMPT = """
+### Instruction:
+
+### Instruction:
+
+Prompt:
+{query}
+
+Context:
+{rag_context}
+
+Prompt:
+{query}
+
+Your task is to write a response to the given prompt. If the prompt is clearly a question, then answer it directly. If the prompt is a statement, then respond using the most important related context that you have been given. WRITE THREE DISTINCT SECTIONS `### My Work`, `### My Answer`, and `### My Further Considerations` as shown below. ONLY USE THE PROVIDED CONTEXT to answer the question, and if the context is insufficient then answer with \"Insufficient Context\". Be sure to include at least 5 distinct sources in your final answer.\n\n### My Work\n- **Valid Premise Check:** Ensure that the question contains a valid premise before answering.\n- **Contextual Analysis:** Think step-by-step about the provided context to identify the most important and/or relevant themes covered in the provided information. Be sure to consider at least 5 distinct sources.\n\n### My Answer\n- **Answer with Inline Numbered Citations:** Provide a response in two to three paragraphs, including inline citations to the most relevant evidence from the provided context. Use the following format for inline citations: `[1]`, `[2]`, `[3]`, etc.\n\n### My Further Considerations\n- **Implications and Follow-Ups:** Consider the implications of the question and answer. Identify any related follow-up questions or considerations that come to mind.\n- **Queries:** Provide a comma-separated list of Google queries that you would like to see answered in the future.\n\nBegin your work now:\n\n
+
+### Response:
+
+### Response:
+
+"""
+
+
+class RagDemo:
+    """A demonstration of the agent-search + synthesizer RAG pipeline."""
+
+    def __init__(self):
+        try:
+            import synthesizer
+        except ImportError as e:
+            raise ImportError(
+                f"Demo run_rag.py failed with {e}. Please run `pip install sciphi-synthesizer` before attempting to run this script."
+            )
+
+    def run(
+        self,
+        query="What is a quantum field theory in curved space time?",
+        # rag_prompt="### Instruction:\nYour task is to use the context which follows to answer the question with a two paragraph line-item cited response:\n{rag_context}\nBegin your two paragraph answer with line-item citations now:### Response:\n",
+        # rag parameters
+        rag_provider_name="agent-search",
+        rag_api_base="https://api.sciphi.ai",
+        rag_limit_hierarchical_url_results="50",
+        rag_limit_final_pagerank_results="20",
+        # llm parameters
+        llm_provider_name="sciphi",
+        llm_model_name="SciPhi/SciPhi-SearchAgent-Alpha-7B",
+        llm_max_tokens_to_sample=1_024,
+        llm_temperature=0.1,
+        llm_top_p=0.95,
+    ):
+        # RAG Provider Settings
+        rag_interface = RAGInterfaceManager.get_interface_from_args(
+            RAGProviderName(rag_provider_name),
+            api_base=rag_api_base,
+            # limit_hierarchical_url_results=rag_limit_hierarchical_url_results,
+            # limit_final_pagerank_results=rag_limit_final_pagerank_results,
+        )
+        rag_context = rag_interface.get_rag_context(query)
+
+        # LLM Provider Settings
+        llm_interface = LLMInterfaceManager.get_interface_from_args(
+            LLMProviderName(llm_provider_name),
+        )
+
+        generation_config = GenerationConfig(
+            model_name=llm_model_name,
+            max_tokens_to_sample=llm_max_tokens_to_sample,
+            temperature=llm_temperature,
+            top_p=llm_top_p,
+            # other generation params here ...
+        )
+
+        formatted_prompt = PROMPT.format(rag_context=rag_context, query=query)
+        completion = llm_interface.get_completion(
+            formatted_prompt, generation_config
+        )
+        print(completion)
+
+        ### Example output (from a query about Fermat's Last Theorem):
+        # Fermat's Last Theorem was proven by British mathematician Andrew Wiles in 1994 (Wikipedia). Wiles's proof was based on a special case of the modularity theorem for elliptic curves, along with Ribet's theorem (Wikipedia). The modularity theorem and Fermat's Last Theorem were previously considered inaccessible to proof by contemporaneous mathematicians (Wikipedia). However, Wiles's proof provided a solution to Fermat's Last Theorem, which had remained unproved for over 300 years (PlanetMath). Wiles's proof is widely accepted and has been recognized with numerous awards, including the Abel Prize in 2016 (Wikipedia).
+
+        # It is important to note that Wiles's proof of Fermat's Last Theorem is a mathematical proof and not related to the science fiction novel "The Last Theorem" by Arthur C. Clarke and Frederik Pohl (Wikipedia). The novel is a work of fiction and does not provide a real mathematical proof for Fermat's Last Theorem (Wikipedia). Additionally, there have been other attempts to prove Fermat's Last Theorem, such as Sophie Germain's approach, but Wiles's proof is the most widely accepted and recognized (Math Stack Exchange).
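+
+# Example (assumed) command-line invocation via python-fire; the flags mirror
+# the keyword arguments of `RagDemo.run` above:
+#
+#   python synthesizer/scripts/run_rag.py run \
+#       --query="What is a quantum field theory in curved space time?" \
+#       --llm_model_name="SciPhi/SciPhi-SearchAgent-Alpha-7B"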
+
+
+if __name__ == "__main__":
+    fire.Fire(RagDemo)