diff --git a/buster/busterbot.py b/buster/busterbot.py
index e321849..638c28c 100644
--- a/buster/busterbot.py
+++ b/buster/busterbot.py
@@ -76,7 +76,7 @@ class BusterConfig:
     completion_cfg: dict = field(
         default_factory=lambda: {
             "completion_kwargs": {
-                "engine": "gpt-3.5-turbo",
+                "model": "gpt-3.5-turbo",
                 "temperature": 0,
                 "stream": True,
             },
diff --git a/buster/completers/base.py b/buster/completers/base.py
index e6b989b..24d1a91 100644
--- a/buster/completers/base.py
+++ b/buster/completers/base.py
@@ -4,7 +4,6 @@
 from abc import ABC, abstractmethod
 from typing import Any, Iterator, Optional

-import openai
 import pandas as pd
 from fastapi.encoders import jsonable_encoder

diff --git a/buster/completers/chatgpt.py b/buster/completers/chatgpt.py
index 6e9ed6b..51deba8 100644
--- a/buster/completers/chatgpt.py
+++ b/buster/completers/chatgpt.py
@@ -2,16 +2,19 @@
 import os
 from typing import Iterator

-import openai
+from openai import BadRequestError, OpenAI

 from buster.completers import Completer

+client = OpenAI()
+
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

 # Check if an API key exists for promptlayer, if it does, use it
promptlayer_api_key = os.environ.get("PROMPTLAYER_API_KEY")
 if promptlayer_api_key:
+    # TODO: Check if this still works with latest OpenAI API...
     try:
         import promptlayer
@@ -38,11 +41,8 @@ def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str
         try:
             error = False
-            response = openai.ChatCompletion.create(
-                messages=messages,
-                **completion_kwargs,
-            )
-        except openai.error.InvalidRequestError:
+            response = client.chat.completions.create(messages=messages, **completion_kwargs)
+        except BadRequestError:
             error = True
             logger.exception("Invalid request to OpenAI API. See traceback:")
             error_message = "Something went wrong with connecting with OpenAI, try again soon!"
@@ -59,11 +59,15 @@ def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str
         # openai response to be easier to handle later
         def answer_generator():
             for chunk in response:
-                token: str = chunk["choices"][0]["delta"].get("content", "")
+                token = chunk.choices[0].delta.content
+
+                # Always stream a string; OpenAI returns None on the last token
+                token = "" if token is None else token
+
                 yield token

         return answer_generator(), error
     else:
-        full_response: str = response["choices"][0]["message"]["content"]
+        full_response: str = response.choices[0].message.content

         return full_response, error
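For reference, the v1-style calls these hunks move to look roughly like the sketch below. This is a minimal illustration, not code from the repo: the model name and messages are placeholder values.

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Non-streaming: the v1 response is a typed object, not a dict.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)

# Streaming: iterate over chunks; delta.content is None on the final chunk,
# which is why the hunk above coerces None to "".
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
```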
diff --git a/buster/documents_manager/base.py b/buster/documents_manager/base.py
index 584f26f..22b9f86 100644
--- a/buster/documents_manager/base.py
+++ b/buster/documents_manager/base.py
@@ -4,11 +4,13 @@
 from dataclasses import dataclass
 from typing import Optional

-import numpy as np
-import openai
 import pandas as pd
+from openai import OpenAI
 from tqdm import tqdm
-from tqdm.contrib.concurrent import process_map
+
+from buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding
+
+client = OpenAI()

 tqdm.pandas()

@@ -16,39 +18,6 @@
 logging.basicConfig(level=logging.INFO)


-def get_embedding_openai(text: str, model="text-embedding-ada-002") -> np.ndarray:
-    text = text.replace("\n", " ")
-    try:
-        return np.array(openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"], dtype=np.float32)
-    except:
-        # This rarely happens with the API but in the off chance it does, will allow us not to loose the progress.
-        logger.warning("Embedding failed to compute.")
-        return None
-
-
-def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
-    """Compute the embeddings on the 'content' column of a DataFrame in parallel.
-
-    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified
-    embedding function. The 'content' column is expected to contain strings or textual data. The method processes the
-    embeddings in parallel using the number of workers specified.
-
-    Args:
-        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
-        embedding_fn (callable): A function that computes embeddings for a given input string.
-        num_workers (int): The number of parallel workers to use for computing embeddings.
-
-    Returns:
-        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
-    """
-
-    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
-    embeddings = process_map(embedding_fn, df.content.to_list(), max_workers=num_workers)
-
-    logger.info(f"Finished computing embeddings")
-    return embeddings
-
-
 @dataclass
 class DocumentsManager(ABC):
     def __init__(self, required_columns: Optional[list[str]] = None):
@@ -89,7 +58,7 @@ def add(
         self,
         df: pd.DataFrame,
         num_workers: int = 16,
-        embedding_fn: callable = get_embedding_openai,
+        embedding_fn: callable = get_openai_embedding,
         csv_filename: Optional[str] = None,
         csv_overwrite: bool = True,
         **add_kwargs,
@@ -133,7 +102,7 @@ def batch_add(
         batch_size: int = 3000,
         min_time_interval: int = 60,
         num_workers: int = 16,
-        embedding_fn: callable = get_embedding_openai,
+        embedding_fn: callable = get_openai_embedding,
         csv_filename: Optional[str] = None,
         csv_overwrite: bool = False,
         **add_kwargs,
diff --git a/buster/documents_manager/deeplake.py b/buster/documents_manager/deeplake.py
index ce3ea73..990a4f5 100644
--- a/buster/documents_manager/deeplake.py
+++ b/buster/documents_manager/deeplake.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Optional

-import openai
 import pandas as pd

 from buster.utils import zip_contents
diff --git a/buster/llm_utils/__init__.py b/buster/llm_utils/__init__.py
index 1d0411e..f1c6be2 100644
--- a/buster/llm_utils/__init__.py
+++ b/buster/llm_utils/__init__.py
@@ -1,3 +1,8 @@
+from buster.llm_utils.embeddings import (
+    compute_embeddings_parallelized,
+    cosine_similarity,
+    get_openai_embedding,
+)
 from buster.llm_utils.question_reformulator import QuestionReformulator

-__all__ = [QuestionReformulator]
+__all__ = ["QuestionReformulator", "cosine_similarity", "get_openai_embedding", "compute_embeddings_parallelized"]
diff --git a/buster/llm_utils/embeddings.py b/buster/llm_utils/embeddings.py
new file mode 100644
index 0000000..190378f
--- /dev/null
+++ b/buster/llm_utils/embeddings.py
@@ -0,0 +1,54 @@
+import logging
+
+import numpy as np
+import pandas as pd
+from openai import OpenAI
+from tqdm.contrib.concurrent import process_map
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+client = OpenAI()
+
+
+def get_openai_embedding(text: str, model: str = "text-embedding-ada-002"):
+    try:
+        text = text.replace("\n", " ")
+        response = client.embeddings.create(
+            input=text,
+            model=model,
+        )
+        embedding = response.data[0].embedding
+        return np.array(embedding, dtype="float32")
+    except Exception as e:
+        # This rarely happens with the API but in the off chance it does, will allow us not to lose the progress.
+        logger.exception(e)
+        logger.warning(f"Embedding failed to compute for {text=}")
+        return None
+
+
+def cosine_similarity(a, b):
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+
+def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
+    """Compute the embeddings on the 'content' column of a DataFrame in parallel.
+
+    This function calculates embeddings for the entries in the 'content' column of the provided DataFrame using the
+    specified embedding function. The 'content' column is expected to contain strings or textual data. The function
+    processes the embeddings in parallel using the number of workers specified.
+
+    Args:
+        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
+        embedding_fn (callable): A function that computes embeddings for a given input string.
+        num_workers (int): The number of parallel workers to use for computing embeddings.
+
+    Returns:
+        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
+    """
+
+    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
+    embeddings = process_map(embedding_fn, df.content.to_list(), max_workers=num_workers)
+
+    logger.info("Finished computing embeddings")
+    return embeddings
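Pulled together, the new `buster.llm_utils.embeddings` helpers compose like the minimal sketch below. It assumes `OPENAI_API_KEY` is set and uses made-up sample strings; the `__main__` guard matters because `process_map` spawns worker processes.

```python
import pandas as pd

from buster.llm_utils import (
    compute_embeddings_parallelized,
    cosine_similarity,
    get_openai_embedding,
)

if __name__ == "__main__":
    df = pd.DataFrame({"content": ["An apple is a fruit.", "Paris is a city."]})

    # Embed both rows in parallel with the default text-embedding-ada-002 model.
    embeddings = compute_embeddings_parallelized(
        df, embedding_fn=get_openai_embedding, num_workers=2
    )

    # Compare the two vectors.
    print(cosine_similarity(embeddings[0], embeddings[1]))
```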
diff --git a/buster/retriever/base.py b/buster/retriever/base.py
index 5af0a97..0436ae1 100644
--- a/buster/retriever/base.py
+++ b/buster/retriever/base.py
@@ -8,7 +8,7 @@
 import pandas as pd

 from buster.completers import UserInputs
-from buster.documents_manager.base import get_embedding_openai
+from buster.llm_utils import get_openai_embedding

 ALL_SOURCES = "All"

@@ -41,7 +41,7 @@ def get_source_display_name(self, source: str) -> str:
     @lru_cache
     def get_embedding(query: str, model: str) -> np.ndarray:
         logger.info("generating embedding")
-        return get_embedding_openai(query, model=model)
+        return get_openai_embedding(query, model=model)

     @abstractmethod
     def get_topk_documents(self, query: str, source: str = None, top_k: int = None) -> pd.DataFrame:
diff --git a/buster/retriever/deeplake.py b/buster/retriever/deeplake.py
index 502dcb9..236626a 100644
--- a/buster/retriever/deeplake.py
+++ b/buster/retriever/deeplake.py
@@ -70,13 +70,6 @@ def __init__(
         from deeplake.core.vectorstore import VectorStore

         super().__init__(**kwargs)
-        if activeloop_token is None:
-            logger.warning(
-                """
-                No activeloop token detected, enterprise features will not be available.
-                You can set it using: export ACTIVELOOP_TOKEN=...
-                """
-            )
         self.use_tql = use_tql
         self.exec_option = exec_option
         self.deep_memory = deep_memory
@@ -87,6 +80,14 @@
             exec_option=exec_option,
         )

+        if activeloop_token is None and use_tql:
+            logger.warning(
+                """
+                No activeloop token detected, enterprise features will not be available.
+                You can set it using: export ACTIVELOOP_TOKEN=...
+                """
+            )
+
     def get_documents(self, sources: Optional[list[str]] = None):
         """Get all current documents from a given source."""
         k = len(self.vector_store)
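Note that `Retriever.get_embedding` keeps its `@lru_cache`, so repeated queries reuse a cached vector instead of re-calling the API. A toy illustration of that behaviour, with a stand-in function rather than a real embedding call:

```python
from functools import lru_cache

import numpy as np


@lru_cache
def get_embedding(query: str, model: str) -> np.ndarray:
    print("cache miss, computing embedding")  # printed only on a miss
    return np.zeros(1536, dtype="float32")  # stand-in for a real API call


get_embedding("What is an apple?", "text-embedding-ada-002")  # miss: prints
get_embedding("What is an apple?", "text-embedding-ada-002")  # hit: silent
```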
+ """ + ) + def get_documents(self, sources: Optional[list[str]] = None): """Get all current documents from a given source.""" k = len(self.vector_store) diff --git a/buster/utils.py b/buster/utils.py index 978a4ec..398dfaa 100644 --- a/buster/utils.py +++ b/buster/utils.py @@ -1,18 +1,6 @@ import os import urllib.request import zipfile -from dataclasses import dataclass -from typing import Optional - - -@dataclass -class UserInputs: - original_input: str - reformulated_input: Optional[str] = None - - @property - def current_input(self): - return self.reformulated_input if self.reformulated_input is not None else self.original_input def get_file_extension(filepath: str) -> str: diff --git a/buster/validators/base.py b/buster/validators/base.py index 7e134eb..3cffb4c 100644 --- a/buster/validators/base.py +++ b/buster/validators/base.py @@ -3,7 +3,8 @@ from functools import lru_cache import pandas as pd -from openai.embeddings_utils import cosine_similarity, get_embedding + +from buster.llm_utils import cosine_similarity, get_openai_embedding logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -24,10 +25,10 @@ def __init__( @staticmethod @lru_cache - def get_embedding(query: str, engine: str): + def get_embedding(text: str, model: str): """Currently supports OpenAI embeddings, override to add your own.""" logger.info("generating embedding") - return get_embedding(query, engine=engine) + return get_openai_embedding(text, model) @abstractmethod def check_question_relevance(self, question: str) -> tuple[bool, str]: @@ -49,7 +50,7 @@ def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFr answer_embedding = self.get_embedding( answer, - engine=self.embedding_model, + model=self.embedding_model, ) col = "similarity_to_answer" matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding)) diff --git a/buster/validators/question_answer_validator.py b/buster/validators/question_answer_validator.py index ccf0b32..bccefe7 100644 --- a/buster/validators/question_answer_validator.py +++ b/buster/validators/question_answer_validator.py @@ -1,8 +1,7 @@ import logging -from openai.embeddings_utils import cosine_similarity - from buster.completers import ChatGPTCompleter +from buster.llm_utils import cosine_similarity from buster.validators import Validator logger = logging.getLogger(__name__) @@ -70,14 +69,14 @@ def check_answer_relevance(self, answer: str) -> bool: unknown_embeddings = [ self.get_embedding( unknown_response, - engine=self.embedding_model, + model=self.embedding_model, ) for unknown_response in unknown_responses ] answer_embedding = self.get_embedding( answer, - engine=self.embedding_model, + model=self.embedding_model, ) unknown_similarity_scores = [ cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings diff --git a/requirements.txt b/requirements.txt index 4de90d9..92f03e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ deeplake gradio>=3.40 matplotlib numpy -openai[embeddings]==0.28.1 +openai>=1.0 pandas pinecone-client pymongo diff --git a/tests/test_documents.py b/tests/test_documents.py index a5ff306..c4b25e1 100644 --- a/tests/test_documents.py +++ b/tests/test_documents.py @@ -5,10 +5,8 @@ import pytest from buster.documents_manager import DeepLakeDocumentsManager -from buster.documents_manager.base import ( - compute_embeddings_parallelized, - get_embedding_openai, -) +from buster.documents_manager.base import 
diff --git a/tests/test_documents.py b/tests/test_documents.py
index a5ff306..c4b25e1 100644
--- a/tests/test_documents.py
+++ b/tests/test_documents.py
@@ -5,10 +5,8 @@
 import pytest

 from buster.documents_manager import DeepLakeDocumentsManager
-from buster.documents_manager.base import (
-    compute_embeddings_parallelized,
-    get_embedding_openai,
-)
+from buster.documents_manager.base import compute_embeddings_parallelized
+from buster.llm_utils import get_openai_embedding
 from buster.retriever import DeepLakeRetriever

 # Patch the get_embedding function to return a fixed, fake embedding
@@ -148,9 +146,9 @@ def test_generate_embeddings_parallelized():
     )

     embeddings_parallel = compute_embeddings_parallelized(
-        df, embedding_fn=get_embedding_openai, num_workers=NUM_WORKERS
+        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS
     )
-    embeddings = df.content.apply(get_embedding_openai)
+    embeddings = df.content.apply(get_openai_embedding)

     # embeddings comes out as a series because of the apply, so cast it back to an array
     embeddings_arr = np.array(embeddings.to_list())
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 4c7dcff..718d33d 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -1,6 +1,6 @@
 import pandas as pd

-from openai.embeddings_utils import get_embedding
+from buster.llm_utils import get_openai_embedding
 from buster.validators import QuestionAnswerValidator, Validator

 validator_cfg = {
@@ -42,7 +42,7 @@ def test_validator_rerank_docs():
     ]
     matched_documents = pd.DataFrame({"documents": documents})
     matched_documents["embedding"] = matched_documents.documents.apply(
-        lambda x: get_embedding(x, engine=validator.embedding_model)
+        lambda x: get_openai_embedding(x, model=validator.embedding_model)
     )

     answer = "An apple is a delicious fruit."
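`test_generate_embeddings_parallelized` still exercises the live API. If an offline variant were ever wanted, something along these lines would work; `fake_embedding_fn` is hypothetical and must live at module level because `process_map` pickles it for the worker processes. (A dummy `OPENAI_API_KEY` may still be needed, since `documents_manager.base` instantiates an `OpenAI` client at import time.)

```python
import numpy as np
import pandas as pd

from buster.documents_manager.base import compute_embeddings_parallelized


def fake_embedding_fn(text: str) -> np.ndarray:
    # Fixed, fake embedding: no network access needed.
    return np.ones(1536, dtype="float32")


def test_compute_embeddings_parallelized_offline():
    df = pd.DataFrame({"content": ["first chunk", "second chunk"]})
    embeddings = compute_embeddings_parallelized(df, embedding_fn=fake_embedding_fn, num_workers=2)

    assert len(embeddings) == len(df)
    assert all(e.shape == (1536,) for e in embeddings)
```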