From d45900f0a79cd07b42bdf47e633ad45dbc96dd80 Mon Sep 17 00:00:00 2001 From: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Date: Wed, 3 Jan 2024 15:51:46 -0500 Subject: [PATCH] Feature/update 01 02 24 rebased (#136) * update model * tally distribution * update synthesizer project id * update output script * updates * fix scripts --- pyproject.toml | 2 +- synthesizer/core/base.py | 3 +- synthesizer/interface/__init__.py | 16 +- synthesizer/interface/base.py | 10 +- synthesizer/interface/llm/sciphi_interface.py | 4 +- synthesizer/interface/rag/agent_search.py | 26 ++- .../interface/rag/bing_search/__init__.py | 3 + synthesizer/interface/rag/bing_search/base.py | 62 +++++++ .../interface/rag/bing_search/bing_client.py | 172 ++++++++++++++++++ .../interface/rag/bing_search/bing_types.py | 104 +++++++++++ .../rag/{google_search.py => serp_api.py} | 19 +- synthesizer/scripts/run_rag.py | 20 +- 12 files changed, 400 insertions(+), 41 deletions(-) create mode 100644 synthesizer/interface/rag/bing_search/__init__.py create mode 100644 synthesizer/interface/rag/bing_search/base.py create mode 100644 synthesizer/interface/rag/bing_search/bing_client.py create mode 100644 synthesizer/interface/rag/bing_search/bing_types.py rename synthesizer/interface/rag/{google_search.py => serp_api.py} (95%) diff --git a/pyproject.toml b/pyproject.toml index 2bd338f..1e61129 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = ["Owen Colegrove "] license = "Apache-2.0" readme = "README.md" name = 'sciphi-synthesizer' -version = '1.0.3' +version = '1.0.5' packages = [ { include = "synthesizer" } ] diff --git a/synthesizer/core/base.py b/synthesizer/core/base.py index a0ad67b..0bb2e55 100644 --- a/synthesizer/core/base.py +++ b/synthesizer/core/base.py @@ -18,4 +18,5 @@ class LLMProviderName(Enum): class RAGProviderName(Enum): LOCAL = "local" AGENT_SEARCH = "agent-search" - GOOGLE_SEARCH = "google-search" + SERP_API = "serp-api" + BING = "bing" diff --git a/synthesizer/interface/__init__.py b/synthesizer/interface/__init__.py index cde08ee..06ef46a 100644 --- a/synthesizer/interface/__init__.py +++ b/synthesizer/interface/__init__.py @@ -16,11 +16,15 @@ AgentSearchRAGConfig, AgentSearchRAGInterface, ) -from synthesizer.interface.rag.google_search import ( - GoogleSearchRAGConfig, - GoogleSearchRAGInterface, +from synthesizer.interface.rag.bing_search import ( + BingRAGConfig, + BingRAGInterface, ) from synthesizer.interface.rag.local import LocalRAGInterface +from synthesizer.interface.rag.serp_api import ( + SERPSearchRAGConfig, + SERPSearchRAGInterface, +) from synthesizer.interface.rag_interface_manager import RAGInterfaceManager __all__ = [ @@ -42,6 +46,8 @@ "LocalRAGInterface", "AgentSearchRAGConfig", "AgentSearchRAGInterface", - "GoogleSearchRAGConfig", - "GoogleSearchRAGInterface", + "SERPSearchRAGConfig", + "SERPSearchRAGInterface", + "BingRAGConfig", + "BingRAGInterface", ] diff --git a/synthesizer/interface/base.py b/synthesizer/interface/base.py index fdba756..ef6323f 100644 --- a/synthesizer/interface/base.py +++ b/synthesizer/interface/base.py @@ -1,7 +1,7 @@ """A module which defines interface abstractions for various LLM providers.""" from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, List, Optional, Type +from typing import Any, Dict, List, Optional, Type from synthesizer.core import LLMProviderName, RAGProviderName from synthesizer.llm import LLM, GenerationConfig, LLMConfig @@ -66,6 +66,12 @@ class RAGProviderConfig(ABC): api_key: Optional[str] = None +@dataclass +class RagResult(ABC): + context: str + meta_data: Optional[List[Dict[str, str]]] = None + + class RAGInterface(ABC): """An abstract class to provide a common interface for RAG providers.""" @@ -79,7 +85,7 @@ def __init__( self.config = config @abstractmethod - def get_rag_context(self, query: str) -> list[str]: + def get_rag_context(self, query: str) -> RagResult: """Get the context for a given query.""" pass diff --git a/synthesizer/interface/llm/sciphi_interface.py b/synthesizer/interface/llm/sciphi_interface.py index 776e79f..30b2f51 100644 --- a/synthesizer/interface/llm/sciphi_interface.py +++ b/synthesizer/interface/llm/sciphi_interface.py @@ -31,9 +31,7 @@ def get_completion( logger.debug( f"Getting completion from SciPhi API for model={generation_config.model_name}" ) - return self.model.get_instruct_completion( - prompt, generation_config - ) + return self.model.get_instruct_completion(prompt, generation_config) def get_chat_completion( self, conversation: list[dict], generation_config: GenerationConfig diff --git a/synthesizer/interface/rag/agent_search.py b/synthesizer/interface/rag/agent_search.py index 5db115e..c46f096 100644 --- a/synthesizer/interface/rag/agent_search.py +++ b/synthesizer/interface/rag/agent_search.py @@ -1,10 +1,13 @@ import os from dataclasses import dataclass -from agent_search.core import SERPClient - +from agent_search.core import AgentSearchClient from synthesizer.core import RAGProviderName -from synthesizer.interface.base import RAGInterface, RAGProviderConfig +from synthesizer.interface.base import ( + RAGInterface, + RAGProviderConfig, + RagResult, +) from synthesizer.interface.rag_interface_manager import ( rag_config, rag_provider, @@ -39,7 +42,7 @@ def __init__( ) -> None: super().__init__(config) self.config: AgentSearchRAGConfig = config - self.client = SERPClient(config.api_base) + self.client = AgentSearchClient(config.api_base) def get_rag_context(self, query) -> list[str]: """Get the context for a prompt.""" @@ -48,16 +51,19 @@ def get_rag_context(self, query) -> list[str]: raise ValueError( "No API key provided. Please provide an API key or set the SCIPHI_API_KEY environment variable." ) - results = self.client.search( + serp_results = self.client.search( query, self.config.limit_broad_results, self.config.limit_deduped_url_results, self.config.limit_hierarchical_url_results, self.config.limit_final_pagerank_results, ) - return "\n".join( - [ - f"{i+1}. URL: {result.url} (Score: {result.score:.2f})\nTitle:{result.title}\nSnippet:\n{result.text}" - for i, result in enumerate(results) - ] + return RagResult( + context="\n".join( + [ + f"{i+1}. URL: {result.url} (Score: {result.score:.2f})\nTitle:{result.title}\nSnippet:\n{result.text}" + for i, result in enumerate(serp_results) + ] + ), + meta_data=[ele.to_string_dict() for ele in serp_results], ) diff --git a/synthesizer/interface/rag/bing_search/__init__.py b/synthesizer/interface/rag/bing_search/__init__.py new file mode 100644 index 0000000..0fcc0fa --- /dev/null +++ b/synthesizer/interface/rag/bing_search/__init__.py @@ -0,0 +1,3 @@ +from .base import BingRAGConfig, BingRAGInterface + +__all__ = ["BingRAGConfig", "BingRAGInterface"] diff --git a/synthesizer/interface/rag/bing_search/base.py b/synthesizer/interface/rag/bing_search/base.py new file mode 100644 index 0000000..d8832ad --- /dev/null +++ b/synthesizer/interface/rag/bing_search/base.py @@ -0,0 +1,62 @@ +import os +from dataclasses import dataclass + +from synthesizer.core import RAGProviderName +from synthesizer.interface.base import ( + RAGInterface, + RAGProviderConfig, + RagResult, +) +from synthesizer.interface.rag_interface_manager import ( + rag_config, + rag_provider, +) + +from .bing_client import BingSearchClient # Import your BingSearchClient + + +@dataclass +@rag_config +class BingRAGConfig(RAGProviderConfig): + """Configuration for the Bing RAG provider.""" + + provider_name: RAGProviderName = RAGProviderName.BING + api_base: str = "https://api.bing.microsoft.com/v7.0/search" + limit_results: int = 30 + + +@rag_provider +class BingRAGInterface(RAGInterface): + """A RAG provider that uses Bing as the retrieval source.""" + + provider_name = RAGProviderName.BING + FORMAT_INDENT = " " + + def __init__( + self, config: BingRAGConfig = BingRAGConfig(), *args, **kwargs + ) -> None: + super().__init__(config) + self.config: BingRAGConfig = config + print('self.config = ', self.config) + api_key = self.config.api_key or os.getenv("BING_API_KEY") + if not api_key: + raise ValueError( + "No API key provided. Please provide an API key or set the BING_API_KEY environment variable." + ) + self.client = BingSearchClient(api_key) + + def get_rag_context(self, query) -> RagResult: + """Retrieve context for a given query using Bing.""" + results = self.client.search(query, self.config.limit_results) + serp_results = self.client.format_as_serp_results(results) + SPLIT_MARKER = "/" + context = "\n\n".join( + [ + f"{i+1}. URL: {SPLIT_MARKER.join(result.url.split(SPLIT_MARKER)[0:4])}\nTitle: {result.title}\nSnippet:\n{result.text}" + for i, result in enumerate(serp_results) + ] + ) + return RagResult( + context=context, + meta_data=[ele.to_string_dict() for ele in serp_results], + ) diff --git a/synthesizer/interface/rag/bing_search/bing_client.py b/synthesizer/interface/rag/bing_search/bing_client.py new file mode 100644 index 0000000..04ddf3c --- /dev/null +++ b/synthesizer/interface/rag/bing_search/bing_client.py @@ -0,0 +1,172 @@ +from typing import Any, Dict, List + +import requests + +from .bing_types import ( + Creator, + DisplayConfig, + Entity, + ImageInfo, + Publisher, + SearchResult, + Video, + WebPage, +) + + +class BingSearchClient: + FIELD_NAME_MAPPING = { + "name": "Title", + "description": "Snippet", + "snippet": "Snippet", + "url": "URL", + "contentUrl": "URL", + } + + def __init__(self, subscription_key: str): + self.subscription_key = subscription_key + self.search_url = "https://api.bing.microsoft.com/v7.0/search" + self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key} + + def search(self, query: str, count: int = 30) -> Dict[str, Any]: + params = { + "q": query, + "textDecorations": True, + "textFormat": "HTML", + "count": count, + } + response = requests.get( + self.search_url, headers=self.headers, params=params + ) + response.raise_for_status() + search_results = response.json() + + # Parse all types of data + parsed_data = { + "entities": self.parse_entities( + search_results.get("entities", {}) + ), + "related_queries": self.parse_related_queries( + search_results.get("relatedSearches", {}) + ), + "web_pages": self.parse_web_pages( + search_results.get("webPages", {}) + ), + "videos": self.parse_videos(search_results.get("videos", {})), + } + + return parsed_data + + def parse_entities(self, entities_data: Dict[str, Any]) -> List[Entity]: + entities = entities_data.get("value", []) + return [Entity.construct(**entity) for entity in entities] + + def parse_related_queries( + self, related_queries_data: Dict[str, Any] + ) -> List[str]: + queries = related_queries_data.get("value", []) + return [query.get("text", "N/A") for query in queries] + + def parse_web_pages(self, web_pages_data: Dict[str, Any]) -> List[WebPage]: + web_pages = web_pages_data.get("value", []) + return [WebPage.construct(**web_page) for web_page in web_pages] + + def parse_videos(self, videos_data: Dict[str, Any]) -> List[Video]: + videos = videos_data.get("value", []) + return [ + Video.construct( + webSearchUrl=video["webSearchUrl"], + name=video["name"], + description=video["description"], + thumbnail=ImageInfo( + thumbnailUrl=video["thumbnailUrl"], + hostPageUrl=video["hostPageUrl"], + width=video["width"], + height=video["height"], + sourceWidth=video.get( + "sourceWidth", video["width"] + ), # Assuming sourceWidth is same as width if not provided + sourceHeight=video.get( + "sourceHeight", video["height"] + ), # Assuming sourceHeight is same as height if not provided + ), + datePublished=video["datePublished"], + publisher=[ + Publisher(name=p["name"]) for p in video["publisher"] + ], + creator=Creator(name=video["creator"]["name"]) + if video.get("creator") + else None, + contentUrl=video["contentUrl"], + hostPageUrl=video["hostPageUrl"], + encodingFormat=video["encodingFormat"], + hostPageDisplayUrl=video["hostPageDisplayUrl"], + duration=video.get("duration"), + viewCount=video.get("viewCount"), + ) + for video in videos + ] + + def print_search_results( + self, search_results: Dict[str, Any], config: DisplayConfig + ) -> str: + output = [] + global_index = 1 # Initialize global index + + def format_item(item, fields): + nonlocal global_index + item_info = ", ".join( + f"{BingSearchClient.FIELD_NAME_MAPPING.get(field, field)}: {getattr(item, field)}" + for field in fields + ) + formatted_item = f"{global_index}.) {item_info}" + global_index += 1 + return formatted_item + + if config.show_entities and "entities" in search_results: + entities_output = ["Entities:"] + [ + format_item(entity, config.entity_fields) + for entity in search_results["entities"] + ] + output.append("\n".join(entities_output)) + + if config.show_related_queries and "related_queries" in search_results: + related_queries_output = ["Related Queries:"] + [ + f"{global_index}. {query}" + for query in search_results["related_queries"] + ] + global_index += len(search_results["related_queries"]) + output.append("\n".join(related_queries_output)) + + if config.show_web_pages and "web_pages" in search_results: + web_pages_output = ["Web Pages:"] + [ + format_item(web_page, config.web_page_fields) + for web_page in search_results["web_pages"] + ] + output.append("\n".join(web_pages_output)) + + if config.show_videos and "videos" in search_results: + videos_output = ["Videos:"] + [ + format_item(video, config.video_fields) + for video in search_results["videos"] + ] + output.append("\n".join(videos_output)) + + return "\n\n".join(output) + + def format_as_serp_results(self, search_results: Dict[str, Any]) -> str: + web_pages = search_results.get("web_pages", []) + + results = [] + for web_page in web_pages: + results.append( + SearchResult( + url=getattr(web_page, "url", ""), + title=getattr(web_page, "name", ""), + dataset=f"Bing Search", + metadata="", + text=getattr(web_page, "description", "") + or getattr(web_page, "snippet", ""), + ) + ) + return results diff --git a/synthesizer/interface/rag/bing_search/bing_types.py b/synthesizer/interface/rag/bing_search/bing_types.py new file mode 100644 index 0000000..68baa35 --- /dev/null +++ b/synthesizer/interface/rag/bing_search/bing_types.py @@ -0,0 +1,104 @@ +from datetime import datetime +from typing import List, Optional + +from pydantic import BaseModel, HttpUrl + + +class ImageInfo(BaseModel): + thumbnailUrl: HttpUrl + hostPageUrl: HttpUrl + width: int + height: int + sourceWidth: int + sourceHeight: int + + +class Entity(BaseModel): + id: HttpUrl + webSearchUrl: HttpUrl + name: str + url: Optional[HttpUrl] = None + description: str + bingId: Optional[str] = None + image: Optional[ImageInfo] = None + + +class DeepLink(BaseModel): + name: str + url: HttpUrl + snippet: Optional[str] = "" + + +class WebPage(BaseModel): + id: HttpUrl + name: str + url: HttpUrl + isFamilyFriendly: bool + displayUrl: str + snippet: str + dateLastCrawled: datetime + language: str + isNavigational: bool + deepLinks: Optional[List[DeepLink]] = [] + + +class Publisher(BaseModel): + name: str + + +class Creator(BaseModel): + name: str + + +class Video(BaseModel): + webSearchUrl: HttpUrl + name: str + description: str + thumbnail: ImageInfo # Replacing individual image fields with ImageInfo + datePublished: datetime + publisher: List[Publisher] + creator: Optional[Creator] + contentUrl: HttpUrl + hostPageUrl: HttpUrl + encodingFormat: str + hostPageDisplayUrl: HttpUrl + duration: Optional[str] + viewCount: Optional[int] + + +class DisplayConfig(BaseModel): + show_entities: bool = True + show_related_queries: bool = True + show_web_pages: bool = True + show_videos: bool = True + + # Add more specific configurations as needed, for example: + entity_fields: List[str] = ["name", "url", "description"] + web_page_fields: List[str] = ["name", "url", "snippet"] + video_fields: List[str] = ["name", "description", "contentUrl"] + + +class SearchResult(BaseModel): + """A dataclass to store the search result""" + + url: str + title: str + dataset: str + metadata: str + text: str + + def __init__(self, **data): + super().__init__(**data) + if self.title and self.title == self.text[0 : len(self.title)]: + self.text = self.text[len(self.title) :] + self.text = self.text.strip() + + def to_string_dict(self) -> dict: + """Returns a dictionary representation with all values as strings.""" + return { + "url": self.url, + "title": self.title, + "dataset": self.dataset, + "metadata": self.metadata, + "text": self.text, + } diff --git a/synthesizer/interface/rag/google_search.py b/synthesizer/interface/rag/serp_api.py similarity index 95% rename from synthesizer/interface/rag/google_search.py rename to synthesizer/interface/rag/serp_api.py index c803a39..be9feb4 100644 --- a/synthesizer/interface/rag/google_search.py +++ b/synthesizer/interface/rag/serp_api.py @@ -2,8 +2,6 @@ from dataclasses import dataclass from typing import Optional -from agent_search.core import SERPClient - from synthesizer.core import RAGProviderName from synthesizer.interface.base import RAGInterface, RAGProviderConfig from synthesizer.interface.rag_interface_manager import ( @@ -30,10 +28,10 @@ def call_search_engine(query, serpapi_api_key): @dataclass @rag_config -class GoogleSearchRAGConfig(RAGProviderConfig): +class SERPSearchRAGConfig(RAGProviderConfig): """An abstract class to hold the configuration for a RAG provider.""" - provider_name: RAGProviderName = RAGProviderName.GOOGLE_SEARCH + provider_name: RAGProviderName = RAGProviderName.SERP_API google_domain: str = "google.com" api_key: Optional[str] = None @@ -359,10 +357,6 @@ def freshprompt_format( answer_box = format_search_results({}) df = pd.concat([df, pd.DataFrame([answer_box])], ignore_index=True) - # Sort by date - # df["date"] = df["date"].apply(lambda x: format_date(x)) - # df["datetime"] = pd.to_datetime(df["date"], errors="coerce") - # df = df.sort_values(by="datetime", na_position="first") df.replace({pd.NaT: None}, inplace=True) df = df.dropna(how="all") @@ -386,15 +380,15 @@ def freshprompt_format( @rag_provider -class GoogleSearchRAGInterface(RAGInterface): +class SERPSearchRAGInterface(RAGInterface): """A RAG provider that uses Wikipedia as the retrieval source.""" - provider_name = RAGProviderName.GOOGLE_SEARCH + provider_name = RAGProviderName.SERP_API FORMAT_INDENT = " " def __init__( self, - config: GoogleSearchRAGConfig = GoogleSearchRAGConfig(), + config: SERPSearchRAGConfig = SERPSearchRAGConfig(), *args, **kwargs, ) -> None: @@ -411,7 +405,7 @@ def __init__( f"ImportError: {e}. Note, `python-dateutil` must be installed to run RAG with Google Search." ) super().__init__(config) - self.config: GoogleSearchRAGConfig = config + self.config: SERPSearchRAGConfig = config def get_rag_context(self, query) -> list[str]: """Get the context for a prompt.""" @@ -435,4 +429,3 @@ def get_rag_context(self, query) -> list[str]: num_questions_and_answers, num_retrieved_evidences, ) - diff --git a/synthesizer/scripts/run_rag.py b/synthesizer/scripts/run_rag.py index 0089a3d..13d119a 100644 --- a/synthesizer/scripts/run_rag.py +++ b/synthesizer/scripts/run_rag.py @@ -1,11 +1,10 @@ import json + +import fire + from synthesizer.core import LLMProviderName, RAGProviderName -from synthesizer.interface import ( - LLMInterfaceManager, - RAGInterfaceManager, -) +from synthesizer.interface import LLMInterfaceManager, RAGInterfaceManager from synthesizer.llm import GenerationConfig -import fire PROMPT = """ ### Instruction: @@ -47,10 +46,19 @@ def run( llm_temperature=0.1, llm_top_p=0.95, ): + # The following example is included to show the customizeability of the SciPhi API + # this workflow can be trivially replicated by calling the below: + + # from agent_search import SciPhi + # client = SciPhi() + # client.get_search_rag_response(query='latest news', search_provider='agent-search', llm_model='SciPhi/Sensei-7B-V1') + + # RAG Provider Settings rag_interface = RAGInterfaceManager.get_interface_from_args( RAGProviderName(rag_provider_name), api_base=rag_api_base, + # -- Customize your search response -- # limit_hierarchical_url_results=rag_limit_hierarchical_url_results, # limit_final_pagerank_results=rag_limit_final_pagerank_results, ) @@ -76,7 +84,7 @@ def run( print(json.loads(completion)) ### Output: - # {"summary": "\nFermat's Last Theorem is a significant result in number theory, stating that for any natural number n greater than 2, there are no solutions to the equation \\(a^n + b^n = c^n\\) where \\(a\\), \\(b\\), and \\(c\\) are positive integers [5]. The theorem was first proposed by Pierre de Fermat in the margins of his copy of Diophantus's \"Arithmetica\" in the 17th century, but it remained unproved for over three centuries [8]. The first case of the theorem to be proven was by Fermat himself for \\(n = 4\\), using a method of infinite descent [9]. Leonhard Euler later provided a proof for the case \\(n = 3\\), although his initial proof contained errors that were later corrected [9].\n\nThe theorem was finally proven in its entirety in 1995 by British mathematician Andrew Wiles, using sophisticated mathematical tools and techniques that were not available during Fermat's lifetime [10]. This breakthrough marked the end of a long period of mathematical speculation and the resolution of a major historical puzzle in mathematics [10]. The proof of Fermat's Last Theorem has been hailed as one of the most significant achievements in the history of mathematics, demonstrating the power of modern mathematical methods and the persistence of mathematical inquiry over centuries [10].\n\n", "other_queries": ["Details of Fermat's Last Theorem proof", "Historical impact of Fermat's Last Theorem", "Contributions of Andrew Wiles to mathematics", "Techniques used in the proof of Fermat's Last Theorem", "Evolution of number theory post-Fermat's Last Theorem"]} + # {"summary": "\nFermat's Last Theorem is a significant result in number theory, stating that for any natural number n greater than 2, there are no solutions to the equation \\(a^n + b^n = c^n\\) where \\(a\\), \\(b\\), and \\(c\\) are positive integers [5]. The theorem was first proposed by Pierre de Fermat in the margins of his copy of Diophantus's \"Arithmetica\" in the 17th century, but it remained unproved for over three centuries [8]. The first case of the theorem to be proven was by Fermat himself for \\(n = 4\\), using a method of infinite descent [9]. Leonhard Euler later provided a proof for the case \\(n = 3\\), although his initial proof contained errors that were later corrected [9].\n\nThe theorem was finally proven in its entirety in 1995 by British mathematician Andrew Wiles, using sophisticated mathematical tools and techniques that were not available during Fermat's lifetime [10]. This breakthrough marked the end of a long period of mathematical speculation and the resolution of a major historical puzzle in mathematics [10]. The proof of Fermat's Last Theorem has been hailed as one of the most significant achievements in the history of mathematics, demonstrating the power of modern mathematical methods and the persistence of mathematical inquiry over centuries [10].\n\n", "other_queries": ["Details of Fermat's Last Theorem proof", "Historical impact of Fermat's Last Theorem", "Contributions of Andrew Wiles to mathematics", "Techniques used in the proof of Fermat's Last Theorem", "Evolution of number theory post-Fermat's Last Theorem"]} if __name__ == "__main__":