Openai Migration (#146)
* migrate to latest openai version

* centralize all embedding functions in one spot

* remove openai[embeddings]

* specify openAI must be >v1

* Fix streaming and completions
jerpint authored Nov 7, 2023
1 parent 7fce687 commit 6388f0d
Showing 15 changed files with 105 additions and 88 deletions.
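
For context, the change this PR applies across the codebase is the move from openai 0.x module-level calls to the 1.x client object. A minimal before/after sketch (not part of the diff; assumes OPENAI_API_KEY is set in the environment):

# Before (openai < 1.0, as removed in this PR):
#   import openai
#   response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
#   text = response["choices"][0]["message"]["content"]

# After (openai >= 1.0):
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

messages = [{"role": "user", "content": "Hello!"}]
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
print(response.choices[0].message.content)  # responses are typed objects, not dicts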
2 changes: 1 addition & 1 deletion buster/busterbot.py
@@ -76,7 +76,7 @@ class BusterConfig:
     completion_cfg: dict = field(
         default_factory=lambda: {
             "completion_kwargs": {
-                "engine": "gpt-3.5-turbo",
+                "model": "gpt-3.5-turbo",
                 "temperature": 0,
                 "stream": True,
             },
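
openai 1.x chat completions accept only model; the legacy engine keyword is rejected with a TypeError. A small sketch of how this config plugs into the new client (the prompt is illustrative, not from the diff):

from openai import OpenAI

client = OpenAI()
messages = [{"role": "user", "content": "ping"}]

completion_kwargs = {"model": "gpt-3.5-turbo", "temperature": 0, "stream": True}
# With "engine" instead of "model", client.chat.completions.create() raises a TypeError
response = client.chat.completions.create(messages=messages, **completion_kwargs)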
1 change: 0 additions & 1 deletion buster/completers/base.py
@@ -4,7 +4,6 @@
 from abc import ABC, abstractmethod
 from typing import Any, Iterator, Optional
 
-import openai
 import pandas as pd
 from fastapi.encoders import jsonable_encoder
20 changes: 12 additions & 8 deletions buster/completers/chatgpt.py
@@ -2,16 +2,19 @@
 import os
 from typing import Iterator
 
-import openai
+from openai import OpenAI
 
 from buster.completers import Completer
 
+client = OpenAI()
+
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 # Check if an API key exists for promptlayer, if it does, use it
 promptlayer_api_key = os.environ.get("PROMPTLAYER_API_KEY")
 if promptlayer_api_key:
+    # TODO: Check if this still works with latest openAI API...
     try:
         import promptlayer
@@ -38,11 +41,8 @@ def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str
 
         try:
             error = False
-            response = openai.ChatCompletion.create(
-                messages=messages,
-                **completion_kwargs,
-            )
-        except openai.error.InvalidRequestError:
+            response = client.chat.completions.create(messages=messages, **completion_kwargs)
+        except openai.InvalidRequestError:
             error = True
             logger.exception("Invalid request to OpenAI API. See traceback:")
             error_message = "Something went wrong with connecting with OpenAI, try again soon!"
@@ -59,11 +59,15 @@ def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str
             # openai response to be easier to handle later
             def answer_generator():
                 for chunk in response:
-                    token: str = chunk["choices"][0]["delta"].get("content", "")
+                    token = chunk.choices[0].delta.content
+
+                    # Always stream a string, openAI returns None on last token
+                    token = "" if token is None else token
+
                     yield token
 
             return answer_generator(), error
 
         else:
-            full_response: str = response["choices"][0]["message"]["content"]
+            full_response: str = response.choices[0].message.content
             return full_response, error
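
Stream handling is the subtlest part of the migration: chunks are now typed objects, and delta.content comes back as None on the final chunk rather than an empty string. A standalone sketch of the same pattern the completer uses (model and prompt are illustrative):

from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)

for chunk in stream:
    token = chunk.choices[0].delta.content
    # Normalize None (sent on the last chunk) to an empty string, as above
    print(token if token is not None else "", end="", flush=True)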
45 changes: 7 additions & 38 deletions buster/documents_manager/base.py
@@ -4,51 +4,20 @@
 from dataclasses import dataclass
 from typing import Optional
 
-import numpy as np
-import openai
 import pandas as pd
+from openai import OpenAI
 from tqdm import tqdm
 from tqdm.contrib.concurrent import process_map
 
+from buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding
+
+client = OpenAI()
+
 tqdm.pandas()
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 
-def get_embedding_openai(text: str, model="text-embedding-ada-002") -> np.ndarray:
-    text = text.replace("\n", " ")
-    try:
-        return np.array(openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"], dtype=np.float32)
-    except:
-        # This rarely happens with the API but in the off chance it does, will allow us not to loose the progress.
-        logger.warning("Embedding failed to compute.")
-        return None
-
-
-def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
-    """Compute the embeddings on the 'content' column of a DataFrame in parallel.
-
-    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified
-    embedding function. The 'content' column is expected to contain strings or textual data. The method processes the
-    embeddings in parallel using the number of workers specified.
-
-    Args:
-        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
-        embedding_fn (callable): A function that computes embeddings for a given input string.
-        num_workers (int): The number of parallel workers to use for computing embeddings.
-
-    Returns:
-        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
-    """
-
-    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
-    embeddings = process_map(embedding_fn, df.content.to_list(), max_workers=num_workers)
-
-    logger.info(f"Finished computing embeddings")
-    return embeddings
 
 
 @dataclass
 class DocumentsManager(ABC):
     def __init__(self, required_columns: Optional[list[str]] = None):
@@ -89,7 +58,7 @@ def add(
         self,
         df: pd.DataFrame,
         num_workers: int = 16,
-        embedding_fn: callable = get_embedding_openai,
+        embedding_fn: callable = get_openai_embedding,
         csv_filename: Optional[str] = None,
         csv_overwrite: bool = True,
         **add_kwargs,
@@ -133,7 +102,7 @@ def batch_add(
         batch_size: int = 3000,
         min_time_interval: int = 60,
         num_workers: int = 16,
-        embedding_fn: callable = get_embedding_openai,
+        embedding_fn: callable = get_openai_embedding,
         csv_filename: Optional[str] = None,
         csv_overwrite: bool = False,
         **add_kwargs,
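
With the helpers now living in buster.llm_utils, callers can still swap in any embedding_fn at call time. A hedged usage sketch with the concrete DeepLakeDocumentsManager (the vector_store_path constructor argument is an assumption for illustration, not shown in this diff):

import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager
from buster.llm_utils import get_openai_embedding

df = pd.DataFrame({"content": ["First document chunk.", "Second document chunk."]})

dm = DeepLakeDocumentsManager(vector_store_path="local_vector_store")  # assumed constructor arg
dm.add(df, num_workers=2, embedding_fn=get_openai_embedding)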
1 change: 0 additions & 1 deletion buster/documents_manager/deeplake.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Optional
 
-import openai
 import pandas as pd
 
 from buster.utils import zip_contents
7 changes: 6 additions & 1 deletion buster/llm_utils/__init__.py
@@ -1,3 +1,8 @@
+from buster.llm_utils.embeddings import (
+    compute_embeddings_parallelized,
+    cosine_similarity,
+    get_openai_embedding,
+)
 from buster.llm_utils.question_reformulator import QuestionReformulator
 
-__all__ = [QuestionReformulator]
+__all__ = [QuestionReformulator, cosine_similarity, get_openai_embedding, compute_embeddings_parallelized]
54 changes: 54 additions & 0 deletions buster/llm_utils/embeddings.py
@@ -0,0 +1,54 @@
+import logging
+
+import numpy as np
+import pandas as pd
+from openai import OpenAI
+from tqdm.contrib.concurrent import process_map
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+client = OpenAI()
+
+
+def get_openai_embedding(text: str, model: str = "text-embedding-ada-002"):
+    try:
+        text = text.replace("\n", " ")
+        response = client.embeddings.create(
+            input=text,
+            model=model,
+        )
+        embedding = response.data[0].embedding
+        return np.array(embedding, dtype="float32")
+    except Exception as e:
+        # This rarely happens with the API but in the off chance it does, will allow us not to loose the progress.
+        logger.exception(e)
+        logger.warning(f"Embedding failed to compute for {text=}")
+        return None
+
+
+def cosine_similarity(a, b):
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+
+def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
+    """Compute the embeddings on the 'content' column of a DataFrame in parallel.
+
+    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified
+    embedding function. The 'content' column is expected to contain strings or textual data. The method processes the
+    embeddings in parallel using the number of workers specified.
+
+    Args:
+        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
+        embedding_fn (callable): A function that computes embeddings for a given input string.
+        num_workers (int): The number of parallel workers to use for computing embeddings.
+
+    Returns:
+        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
+    """
+
+    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
+    embeddings = process_map(embedding_fn, df.content.to_list(), max_workers=num_workers)
+
+    logger.info(f"Finished computing embeddings")
+    return embeddings
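
A quick usage sketch for the new module (requires a valid OPENAI_API_KEY; the texts are illustrative):

import pandas as pd

from buster.llm_utils import (
    compute_embeddings_parallelized,
    cosine_similarity,
    get_openai_embedding,
)

a = get_openai_embedding("What is a transformer?")
b = get_openai_embedding("Explain transformer models.")
print(cosine_similarity(a, b))  # scalar in [-1, 1]; close to 1 for similar texts

df = pd.DataFrame({"content": ["chunk one", "chunk two", "chunk three"]})
df["embedding"] = compute_embeddings_parallelized(
    df, embedding_fn=get_openai_embedding, num_workers=2
)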
4 changes: 2 additions & 2 deletions buster/retriever/base.py
@@ -8,7 +8,7 @@
 import pandas as pd
 
 from buster.completers import UserInputs
-from buster.documents_manager.base import get_embedding_openai
+from buster.llm_utils import get_openai_embedding
 
 ALL_SOURCES = "All"
@@ -41,7 +41,7 @@ def get_source_display_name(self, source: str) -> str:
     @lru_cache
     def get_embedding(query: str, model: str) -> np.ndarray:
         logger.info("generating embedding")
-        return get_embedding_openai(query, model=model)
+        return get_openai_embedding(query, model=model)
 
     @abstractmethod
     def get_topk_documents(self, query: str, source: str = None, top_k: int = None) -> pd.DataFrame:
15 changes: 8 additions & 7 deletions buster/retriever/deeplake.py
@@ -70,13 +70,6 @@ def __init__(
         from deeplake.core.vectorstore import VectorStore
 
         super().__init__(**kwargs)
-        if activeloop_token is None:
-            logger.warning(
-                """
-                No activeloop token detected, enterprise features will not be available.
-                You can set it using: export ACTIVELOOP_TOKEN=...
-                """
-            )
         self.use_tql = use_tql
         self.exec_option = exec_option
         self.deep_memory = deep_memory
@@ -87,6 +80,14 @@ def __init__(
             exec_option=exec_option,
         )
 
+        if activeloop_token is None and use_tql:
+            logger.warning(
+                """
+                No activeloop token detected, enterprise features will not be available.
+                You can set it using: export ACTIVELOOP_TOKEN=...
+                """
+            )
+
     def get_documents(self, sources: Optional[list[str]] = None):
         """Get all current documents from a given source."""
         k = len(self.vector_store)
12 changes: 0 additions & 12 deletions buster/utils.py
@@ -1,18 +1,6 @@
 import os
 import urllib.request
 import zipfile
-from dataclasses import dataclass
-from typing import Optional
-
-
-@dataclass
-class UserInputs:
-    original_input: str
-    reformulated_input: Optional[str] = None
-
-    @property
-    def current_input(self):
-        return self.reformulated_input if self.reformulated_input is not None else self.original_input
 
 
 def get_file_extension(filepath: str) -> str:
9 changes: 5 additions & 4 deletions buster/validators/base.py
@@ -3,7 +3,8 @@
 from functools import lru_cache
 
 import pandas as pd
-from openai.embeddings_utils import cosine_similarity, get_embedding
+
+from buster.llm_utils import cosine_similarity, get_openai_embedding
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -24,10 +25,10 @@ def __init__(
 
     @staticmethod
     @lru_cache
-    def get_embedding(query: str, engine: str):
+    def get_embedding(text: str, model: str):
         """Currently supports OpenAI embeddings, override to add your own."""
         logger.info("generating embedding")
-        return get_embedding(query, engine=engine)
+        return get_openai_embedding(text, model)
 
     @abstractmethod
     def check_question_relevance(self, question: str) -> tuple[bool, str]:
@@ -49,7 +50,7 @@ def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFr
 
         answer_embedding = self.get_embedding(
             answer,
-            engine=self.embedding_model,
+            model=self.embedding_model,
         )
         col = "similarity_to_answer"
         matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding))
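
The reranking step boils down to cosine similarity between the answer embedding and each stored document embedding. A self-contained sketch of the same idea using fake embeddings (no API call; the vectors are illustrative):

import numpy as np
import pandas as pd

from buster.llm_utils import cosine_similarity

answer_embedding = np.array([1.0, 0.0], dtype="float32")
matched_documents = pd.DataFrame(
    {"embedding": [np.array([0.9, 0.1]), np.array([0.0, 1.0])]}
)

col = "similarity_to_answer"
matched_documents[col] = matched_documents.embedding.apply(
    lambda x: cosine_similarity(x, answer_embedding)
)
print(matched_documents.sort_values(by=col, ascending=False))  # best match first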
7 changes: 3 additions & 4 deletions buster/validators/question_answer_validator.py
@@ -1,8 +1,7 @@
 import logging
 
-from openai.embeddings_utils import cosine_similarity
-
 from buster.completers import ChatGPTCompleter
+from buster.llm_utils import cosine_similarity
 from buster.validators import Validator
 
 logger = logging.getLogger(__name__)
@@ -70,14 +69,14 @@ def check_answer_relevance(self, answer: str) -> bool:
         unknown_embeddings = [
             self.get_embedding(
                 unknown_response,
-                engine=self.embedding_model,
+                model=self.embedding_model,
             )
             for unknown_response in unknown_responses
         ]
 
         answer_embedding = self.get_embedding(
             answer,
-            engine=self.embedding_model,
+            model=self.embedding_model,
         )
         unknown_similarity_scores = [
             cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ deeplake
 gradio>=3.40
 matplotlib
 numpy
-openai[embeddings]==0.28.1
+openai>=1.0
 pandas
 pinecone-client
 pymongo
10 changes: 4 additions & 6 deletions tests/test_documents.py
@@ -5,10 +5,8 @@
 import pytest
 
 from buster.documents_manager import DeepLakeDocumentsManager
-from buster.documents_manager.base import (
-    compute_embeddings_parallelized,
-    get_embedding_openai,
-)
+from buster.documents_manager.base import compute_embeddings_parallelized
+from buster.llm_utils import get_openai_embedding
 from buster.retriever import DeepLakeRetriever
 
 # Patch the get_embedding function to return a fixed, fake embedding
@@ -148,9 +146,9 @@ def test_generate_embeddings_parallelized():
     )
 
     embeddings_parallel = compute_embeddings_parallelized(
-        df, embedding_fn=get_embedding_openai, num_workers=NUM_WORKERS
+        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS
     )
-    embeddings = df.content.apply(get_embedding_openai)
+    embeddings = df.content.apply(get_openai_embedding)
 
     # embeddings comes out as a series because of the apply, so cast it back to an array
     embeddings_arr = np.array(embeddings.to_list())
4 changes: 2 additions & 2 deletions tests/test_validator.py
@@ -1,6 +1,6 @@
 import pandas as pd
-from openai.embeddings_utils import get_embedding
 
+from buster.llm_utils import get_openai_embedding
 from buster.validators import QuestionAnswerValidator, Validator
 
 validator_cfg = {
@@ -42,7 +42,7 @@ def test_validator_rerank_docs():
     ]
     matched_documents = pd.DataFrame({"documents": documents})
     matched_documents["embedding"] = matched_documents.documents.apply(
-        lambda x: get_embedding(x, engine=validator.embedding_model)
+        lambda x: get_openai_embedding(x, model=validator.embedding_model)
     )
 
     answer = "An apple is a delicious fruit."
