Openai Migration (#146)
* migrate to latest openai version

* centralize all embedding functions in one spot

* remove openai[embeddings]

* specify openAI must be >v1

* Fix streaming and completions
jerpint authored Nov 7, 2023
1 parent 7fce687 commit 6388f0d
Showing 15 changed files with 105 additions and 88 deletions.
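
For context, the change this PR applies across the codebase is the move from openai 0.x module-level calls to the 1.x client object. A minimal before/after sketch (not part of the diff; assumes OPENAI_API_KEY is set in the environment):

# Before (openai < 1.0, as removed in this PR):
#   import openai
#   response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
#   text = response["choices"][0]["message"]["content"]

# After (openai >= 1.0):
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

messages = [{"role": "user", "content": "Hello!"}]
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
print(response.choices[0].message.content)  # responses are typed objects, not dicts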
2 changes: 1 addition & 1 deletion buster/busterbot.py
@@ -76,7 +76,7 @@ class BusterConfig:
     completion_cfg: dict = field(
         default_factory=lambda: {
             "completion_kwargs": {
-                "engine": "gpt-3.5-turbo",
+                "model": "gpt-3.5-turbo",
                 "temperature": 0,
                 "stream": True,
             },
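
openai 1.x chat completions accept only model; the legacy engine keyword is rejected with a TypeError. A small sketch of how this config plugs into the new client (the prompt is illustrative, not from the diff):

from openai import OpenAI

client = OpenAI()
messages = [{"role": "user", "content": "ping"}]

completion_kwargs = {"model": "gpt-3.5-turbo", "temperature": 0, "stream": True}
# With "engine" instead of "model", client.chat.completions.create() raises a TypeError
response = client.chat.completions.create(messages=messages, **completion_kwargs)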
1 change: 0 additions & 1 deletion buster/completers/base.py
@@ -4,7 +4,6 @@
 from abc import ABC, abstractmethod
 from typing import Any, Iterator, Optional
 
-import openai
 import pandas as pd
 from fastapi.encoders import jsonable_encoder
20 changes: 12 additions & 8 deletions buster/completers/chatgpt.py
@@ -2,16 +2,19 @@
 import os
 from typing import Iterator
 
-import openai
+from openai import OpenAI
 
 from buster.completers import Completer
 
+client = OpenAI()
+
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 # Check if an API key exists for promptlayer, if it does, use it
 promptlayer_api_key = os.environ.get("PROMPTLAYER_API_KEY")
 if promptlayer_api_key:
+    # TODO: Check if this still works with latest openAI API...
     try:
         import promptlayer
@@ -38,11 +41,8 @@ def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str
 
         try:
             error = False
-            response = openai.ChatCompletion.create(
-                messages=messages,
-                **completion_kwargs,
-            )
-        except openai.error.InvalidRequestError:
+            response = client.chat.completions.create(messages=messages, **completion_kwargs)
+        except openai.InvalidRequestError:
             error = True
             logger.exception("Invalid request to OpenAI API. See traceback:")
             error_message = "Something went wrong with connecting with OpenAI, try again soon!"
@@ -59,11 +59,15 @@ def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str
             # openai response to be easier to handle later
             def answer_generator():
                 for chunk in response:
-                    token: str = chunk["choices"][0]["delta"].get("content", "")
+                    token = chunk.choices[0].delta.content
+
+                    # Always stream a string, openAI returns None on last token
+                    token = "" if token is None else token
+
                     yield token
 
             return answer_generator(), error
 
         else:
-            full_response: str = response["choices"][0]["message"]["content"]
+            full_response: str = response.choices[0].message.content
             return full_response, error
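
Stream handling is the subtlest part of the migration: chunks are now typed objects, and delta.content comes back as None on the final chunk rather than an empty string. A standalone sketch of the same pattern the completer uses (model and prompt are illustrative):

from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)

for chunk in stream:
    token = chunk.choices[0].delta.content
    # Normalize None (sent on the last chunk) to an empty string, as above
    print(token if token is not None else "", end="", flush=True)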
45 changes: 7 additions & 38 deletions buster/documents_manager/base.py
@@ -4,51 +4,20 @@
 from dataclasses import dataclass
 from typing import Optional
 
-import numpy as np
-import openai
 import pandas as pd
+from openai import OpenAI
 from tqdm import tqdm
 from tqdm.contrib.concurrent import process_map
 
+from buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding
+
+client = OpenAI()
+
 tqdm.pandas()
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 
-def get_embedding_openai(text: str, model="text-embedding-ada-002") -> np.ndarray:
-    text = text.replace("\n", " ")
-    try:
-        return np.array(openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"], dtype=np.float32)
-    except:
-        # This rarely happens with the API but in the off chance it does, will allow us not to loose the progress.
-        logger.warning("Embedding failed to compute.")
-        return None
-
-
-def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
-    """Compute the embeddings on the 'content' column of a DataFrame in parallel.
-
-    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified
-    embedding function. The 'content' column is expected to contain strings or textual data. The method processes the
-    embeddings in parallel using the number of workers specified.
-
-    Args:
-        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
-        embedding_fn (callable): A function that computes embeddings for a given input string.
-        num_workers (int): The number of parallel workers to use for computing embeddings.
-
-    Returns:
-        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
-    """
-
-    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
-    embeddings = process_map(embedding_fn, df.content.to_list(), max_workers=num_workers)
-
-    logger.info(f"Finished computing embeddings")
-    return embeddings
 
 
 @dataclass
 class DocumentsManager(ABC):
     def __init__(self, required_columns: Optional[list[str]] = None):
@@ -89,7 +58,7 @@ def add(
         self,
         df: pd.DataFrame,
         num_workers: int = 16,
-        embedding_fn: callable = get_embedding_openai,
+        embedding_fn: callable = get_openai_embedding,
         csv_filename: Optional[str] = None,
         csv_overwrite: bool = True,
         **add_kwargs,
@@ -133,7 +102,7 @@ def batch_add(
         batch_size: int = 3000,
         min_time_interval: int = 60,
         num_workers: int = 16,
-        embedding_fn: callable = get_embedding_openai,
+        embedding_fn: callable = get_openai_embedding,
         csv_filename: Optional[str] = None,
         csv_overwrite: bool = False,
         **add_kwargs,
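
With the helpers now living in buster.llm_utils, callers can still swap in any embedding_fn at call time. A hedged usage sketch with the concrete DeepLakeDocumentsManager (the vector_store_path constructor argument is an assumption for illustration, not shown in this diff):

import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager
from buster.llm_utils import get_openai_embedding

df = pd.DataFrame({"content": ["First document chunk.", "Second document chunk."]})

dm = DeepLakeDocumentsManager(vector_store_path="local_vector_store")  # assumed constructor arg
dm.add(df, num_workers=2, embedding_fn=get_openai_embedding)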
1 change: 0 additions & 1 deletion buster/documents_manager/deeplake.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Optional
 
-import openai
 import pandas as pd
 
 from buster.utils import zip_contents
7 changes: 6 additions & 1 deletion buster/llm_utils/__init__.py
@@ -1,3 +1,8 @@
+from buster.llm_utils.embeddings import (
+    compute_embeddings_parallelized,
+    cosine_similarity,
+    get_openai_embedding,
+)
 from buster.llm_utils.question_reformulator import QuestionReformulator
 
-__all__ = [QuestionReformulator]
+__all__ = [QuestionReformulator, cosine_similarity, get_openai_embedding, compute_embeddings_parallelized]
54 changes: 54 additions & 0 deletions buster/llm_utils/embeddings.py
@@ -0,0 +1,54 @@
+import logging
+
+import numpy as np
+import pandas as pd
+from openai import OpenAI
+from tqdm.contrib.concurrent import process_map
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+client = OpenAI()
+
+
+def get_openai_embedding(text: str, model: str = "text-embedding-ada-002"):
+    try:
+        text = text.replace("\n", " ")
+        response = client.embeddings.create(
+            input=text,
+            model=model,
+        )
+        embedding = response.data[0].embedding
+        return np.array(embedding, dtype="float32")
+    except Exception as e:
+        # This rarely happens with the API but in the off chance it does, will allow us not to loose the progress.
+        logger.exception(e)
+        logger.warning(f"Embedding failed to compute for {text=}")
+        return None
+
+
+def cosine_similarity(a, b):
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+
+def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
+    """Compute the embeddings on the 'content' column of a DataFrame in parallel.
+
+    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified
+    embedding function. The 'content' column is expected to contain strings or textual data. The method processes the
+    embeddings in parallel using the number of workers specified.
+
+    Args:
+        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
+        embedding_fn (callable): A function that computes embeddings for a given input string.
+        num_workers (int): The number of parallel workers to use for computing embeddings.
+
+    Returns:
+        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
+    """
+
+    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
+    embeddings = process_map(embedding_fn, df.content.to_list(), max_workers=num_workers)
+
+    logger.info(f"Finished computing embeddings")
+    return embeddings
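
A quick usage sketch for the new module (requires a valid OPENAI_API_KEY; the texts are illustrative):

import pandas as pd

from buster.llm_utils import (
    compute_embeddings_parallelized,
    cosine_similarity,
    get_openai_embedding,
)

a = get_openai_embedding("What is a transformer?")
b = get_openai_embedding("Explain transformer models.")
print(cosine_similarity(a, b))  # scalar in [-1, 1]; close to 1 for similar texts

df = pd.DataFrame({"content": ["chunk one", "chunk two", "chunk three"]})
df["embedding"] = compute_embeddings_parallelized(
    df, embedding_fn=get_openai_embedding, num_workers=2
)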
4 changes: 2 additions & 2 deletions buster/retriever/base.py
@@ -8,7 +8,7 @@
 import pandas as pd
 
 from buster.completers import UserInputs
-from buster.documents_manager.base import get_embedding_openai
+from buster.llm_utils import get_openai_embedding
 
 ALL_SOURCES = "All"
@@ -41,7 +41,7 @@ def get_source_display_name(self, source: str) -> str:
     @lru_cache
     def get_embedding(query: str, model: str) -> np.ndarray:
         logger.info("generating embedding")
-        return get_embedding_openai(query, model=model)
+        return get_openai_embedding(query, model=model)
 
     @abstractmethod
     def get_topk_documents(self, query: str, source: str = None, top_k: int = None) -> pd.DataFrame:
15 changes: 8 additions & 7 deletions buster/retriever/deeplake.py
@@ -70,13 +70,6 @@ def __init__(
         from deeplake.core.vectorstore import VectorStore
 
         super().__init__(**kwargs)
-        if activeloop_token is None:
-            logger.warning(
-                """
-                No activeloop token detected, enterprise features will not be available.
-                You can set it using: export ACTIVELOOP_TOKEN=...
-                """
-            )
         self.use_tql = use_tql
         self.exec_option = exec_option
         self.deep_memory = deep_memory
@@ -87,6 +80,14 @@ def __init__(
             exec_option=exec_option,
         )
 
+        if activeloop_token is None and use_tql:
+            logger.warning(
+                """
+                No activeloop token detected, enterprise features will not be available.
+                You can set it using: export ACTIVELOOP_TOKEN=...
+                """
+            )
+
     def get_documents(self, sources: Optional[list[str]] = None):
         """Get all current documents from a given source."""
         k = len(self.vector_store)
12 changes: 0 additions & 12 deletions buster/utils.py
@@ -1,18 +1,6 @@
 import os
 import urllib.request
 import zipfile
-from dataclasses import dataclass
-from typing import Optional
-
-
-@dataclass
-class UserInputs:
-    original_input: str
-    reformulated_input: Optional[str] = None
-
-    @property
-    def current_input(self):
-        return self.reformulated_input if self.reformulated_input is not None else self.original_input
 
 
 def get_file_extension(filepath: str) -> str:
9 changes: 5 additions & 4 deletions buster/validators/base.py
@@ -3,7 +3,8 @@
 from functools import lru_cache
 
 import pandas as pd
-from openai.embeddings_utils import cosine_similarity, get_embedding
+
+from buster.llm_utils import cosine_similarity, get_openai_embedding
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -24,10 +25,10 @@ def __init__(
 
     @staticmethod
     @lru_cache
-    def get_embedding(query: str, engine: str):
+    def get_embedding(text: str, model: str):
         """Currently supports OpenAI embeddings, override to add your own."""
         logger.info("generating embedding")
-        return get_embedding(query, engine=engine)
+        return get_openai_embedding(text, model)
 
     @abstractmethod
     def check_question_relevance(self, question: str) -> tuple[bool, str]:
@@ -49,7 +50,7 @@ def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFr
 
         answer_embedding = self.get_embedding(
             answer,
-            engine=self.embedding_model,
+            model=self.embedding_model,
         )
         col = "similarity_to_answer"
         matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding))
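
The reranking step boils down to cosine similarity between the answer embedding and each stored document embedding. A self-contained sketch of the same idea using fake embeddings (no API call; the vectors are illustrative):

import numpy as np
import pandas as pd

from buster.llm_utils import cosine_similarity

answer_embedding = np.array([1.0, 0.0], dtype="float32")
matched_documents = pd.DataFrame(
    {"embedding": [np.array([0.9, 0.1]), np.array([0.0, 1.0])]}
)

col = "similarity_to_answer"
matched_documents[col] = matched_documents.embedding.apply(
    lambda x: cosine_similarity(x, answer_embedding)
)
print(matched_documents.sort_values(by=col, ascending=False))  # best match first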
7 changes: 3 additions & 4 deletions buster/validators/question_answer_validator.py
@@ -1,8 +1,7 @@
 import logging
 
-from openai.embeddings_utils import cosine_similarity
-
 from buster.completers import ChatGPTCompleter
+from buster.llm_utils import cosine_similarity
 from buster.validators import Validator
 
 logger = logging.getLogger(__name__)
@@ -70,14 +69,14 @@ def check_answer_relevance(self, answer: str) -> bool:
         unknown_embeddings = [
             self.get_embedding(
                 unknown_response,
-                engine=self.embedding_model,
+                model=self.embedding_model,
             )
             for unknown_response in unknown_responses
         ]
 
         answer_embedding = self.get_embedding(
             answer,
-            engine=self.embedding_model,
+            model=self.embedding_model,
         )
         unknown_similarity_scores = [
             cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ deeplake
 gradio>=3.40
 matplotlib
 numpy
-openai[embeddings]==0.28.1
+openai>=1.0
 pandas
 pinecone-client
 pymongo
10 changes: 4 additions & 6 deletions tests/test_documents.py
@@ -5,10 +5,8 @@
 import pytest
 
 from buster.documents_manager import DeepLakeDocumentsManager
-from buster.documents_manager.base import (
-    compute_embeddings_parallelized,
-    get_embedding_openai,
-)
+from buster.documents_manager.base import compute_embeddings_parallelized
+from buster.llm_utils import get_openai_embedding
 from buster.retriever import DeepLakeRetriever
 
 # Patch the get_embedding function to return a fixed, fake embedding
@@ -148,9 +146,9 @@ def test_generate_embeddings_parallelized():
     )
 
     embeddings_parallel = compute_embeddings_parallelized(
-        df, embedding_fn=get_embedding_openai, num_workers=NUM_WORKERS
+        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS
     )
-    embeddings = df.content.apply(get_embedding_openai)
+    embeddings = df.content.apply(get_openai_embedding)
 
     # embeddings comes out as a series because of the apply, so cast it back to an array
     embeddings_arr = np.array(embeddings.to_list())
4 changes: 2 additions & 2 deletions tests/test_validator.py
@@ -1,6 +1,6 @@
 import pandas as pd
-from openai.embeddings_utils import get_embedding
 
+from buster.llm_utils import get_openai_embedding
 from buster.validators import QuestionAnswerValidator, Validator
 
 validator_cfg = {
@@ -42,7 +42,7 @@ def test_validator_rerank_docs():
     ]
     matched_documents = pd.DataFrame({"documents": documents})
     matched_documents["embedding"] = matched_documents.documents.apply(
-        lambda x: get_embedding(x, engine=validator.embedding_model)
+        lambda x: get_openai_embedding(x, model=validator.embedding_model)
     )
 
     answer = "An apple is a delicious fruit."
